Merge tag 'v0.8.5' into v0.8.5-ori

081057de · zhuwenwen · 7cf5d5c4 · ba41cc90 · 081057de · 081057de
Commit 081057de authored Apr 29, 2025 by zhuwenwen
20 changed files
--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    return response


-if __name__ == "__main__":
+def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
+    return parser.parse_args()
+

-    args = parser.parse_args()
+def main(args):
    api_url = f"http://{args.host}:{args.port}/score"
    model_name = args.model

@@ -30,9 +32,9 @@ if __name__ == "__main__":
    text_2 = "The capital of Brazil is Brasilia."
    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
    score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 and text_2 are both strings:")
+    print("\nPrompt when text_1 and text_2 are both strings:")
    pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
    pprint.pprint(score_response.json())

    text_1 = "What is the capital of France?"
@@ -41,9 +43,9 @@ if __name__ == "__main__":
    ]
    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
    score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 is string and text_2 is a list:")
+    print("\nPrompt when text_1 is string and text_2 is a list:")
    pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
    pprint.pprint(score_response.json())

    text_1 = [
@@ -54,7 +56,12 @@ if __name__ == "__main__":
    ]
    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
    score_response = post_http_request(prompt=prompt, api_url=api_url)
-    print("Prompt when text_1 and text_2 are both lists:")
+    print("\nPrompt when text_1 and text_2 are both lists:")
    pprint.pprint(prompt)
-    print("Score Response:")
+    print("\nScore Response:")
    pprint.pprint(score_response.json())
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/online_serving/openai_embedding_client.py
+++ b/examples/online_serving/openai_embedding_client.py
@@ -6,22 +6,29 @@ from openai import OpenAI
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"

-client = OpenAI(
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-models = client.models.list()
-model = models.data[0].id
-
-responses = client.embeddings.create(
-    input=[
-        "Hello my name is",
-        "The best thing about vLLM is that it supports many different models"
-    ],
-    model=model,
-)
-
-for data in responses.data:
-    print(data.embedding)  # List of float of len 4096
+
+def main():
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    responses = client.embeddings.create(
+        # ruff: noqa: E501
+        input=[
+            "Hello my name is",
+            "The best thing about vLLM is that it supports many different models"
+        ],
+        model=model,
+    )
+
+    for data in responses.data:
+        print(data.embedding)  # List of float of len 4096
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_embedding_matryoshka_fy.py
+++ b/examples/online_serving/openai_embedding_matryoshka_fy.py
+# SPDX-License-Identifier: Apache-2.0
+"""Example Python client for embedding API dimensions using vLLM API server
+NOTE:
+    start a supported Matryoshka Embeddings model server with `vllm serve`, e.g.
+    vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
+"""
+
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+
+def main():
+    """Request a 32-dimensional embedding from the first listed model."""
+    api = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    # Use whichever model the server lists first.
+    served_model = api.models.list().data[0].id
+
+    result = api.embeddings.create(
+        input=["Follow the white rabbit."],
+        model=served_model,
+        dimensions=32,
+    )
+
+    for item in result.data:
+        print(item.embedding)  # List of float of len 32
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_pooling_client.py
+++ b/examples/online_serving/openai_pooling_client.py
@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    return response


-if __name__ == "__main__":
+def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
@@ -25,15 +25,20 @@ if __name__ == "__main__":
                        type=str,
                        default="jason9693/Qwen2.5-1.5B-apeach")

-    args = parser.parse_args()
+    return parser.parse_args()
+
+
+def main(args):
    api_url = f"http://{args.host}:{args.port}/pooling"
    model_name = args.model

    # Input like Completions API
    prompt = {"model": model_name, "input": "vLLM is great!"}
    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    print("-" * 50)
    print("Pooling Response:")
    pprint.pprint(pooling_response.json())
+    print("-" * 50)

    # Input like Chat API
    prompt = {
@@ -50,3 +55,9 @@ if __name__ == "__main__":
    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
    print("Pooling Response:")
    pprint.pprint(pooling_response.json())
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -26,7 +26,12 @@ def sync_openai():
            model="openai/whisper-large-v3",
            language="en",
            response_format="json",
-            temperature=0.0)
+            temperature=0.0,
+            # Additional sampling params not provided by OpenAI API.
+            extra_body=dict(
+                seed=4419,
+                repetition_penalty=1.3,
+            ))
        print("transcription result:", transcription.text)



--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
+See Ray Serve LLM documentation at:
+https://docs.ray.io/en/latest/serve/llm/serving-llms.html
+
+Run `python3 ray_serve_deepseek.py` to deploy the model.
+"""
+
+from ray import serve
+from ray.serve.llm import LLMConfig, build_openai_app
+
+llm_config = LLMConfig(
+    model_loading_config={
+        "model_id": "deepseek",
+        # Since the DeepSeek model is huge, it is recommended to pre-download
+        # the model to local disk, say /path/to/the/model, and specify:
+        # "model_source": "/path/to/the/model"
+        "model_source": "deepseek-ai/DeepSeek-R1",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 1,
+        }
+    },
+    # Change to the accelerator type of the node
+    accelerator_type="H100",
+    runtime_env={"env_vars": {
+        "VLLM_USE_V1": "1"
+    }},
+    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
+)
+
+# Deploy the application
+llm_app = build_openai_app({"llm_configs": [llm_config]})
+serve.run(llm_app)
--- a/examples/tool_chat_template_llama4_json.jinja
+++ b/examples/tool_chat_template_llama4_json.jinja
+{#- Renders the text 'True' when `var` is a non-string iterable whose items all
+    contain a 'type' key, and 'False' otherwise. The flag lives on a namespace
+    object because a plain `set` inside a `for` loop is scoped to the loop body
+    and would not change the value read after `endfor`. #}
+{%- macro is_array_of_type_objects(var) -%}
+    {%- if var is iterable and var is not string -%}
+        {%- set ns = namespace(valid=true) -%}
+        {%- for item in var -%}
+            {%- if 'type' not in item -%}
+                {%- set ns.valid = false -%}
+                {%- break -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {{ ns.valid }}
+    {%- else -%}
+        {{ false }}
+    {%- endif -%}
+{%- endmacro %}
+
+{#- Renders the content of `message`: string content is trimmed; content that
+    is a list of typed parts is rendered part by part (image placeholder or
+    trimmed text); anything else is serialized as JSON. #}
+{%- macro render_message(message) %}
+    {%- if message['content'] is string %}
+        {{- message['content']|trim }}
+    {#- Inspect the message content itself; no variable named `data` exists here. #}
+    {%- elif is_array_of_type_objects(message['content']) == 'True' %}
+        {%- for content in message['content'] %}
+            {%- if content['type'] == 'image' %}
+                {{- '<|image|>' }}
+            {%- elif content['type'] == 'text' %}
+                {{- content['text']|trim }}
+            {%- endif %}
+        {%- endfor %}
+    {%- else %}
+        {{- message['content']|tojson }}
+    {%- endif %}
+{%- endmacro %}
+
+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0] %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = ({ "content": "You are a helpful assistant with tool calling "
+        "capabilities. Only reply with a tool call if the function exists in the "
+        "library provided by the user. If it doesn't exist, just reply directly in "
+        "natural language. When you receive a tool call response, use the output to "
+        "format an answer to the original user question."}) %}
+{%- endif %}
+
+{%- set tool_lib_preamble = 'Tools: You have access to the following tools. You might need to use one '
+    'or more function/tool calls to fulfill the task. \n'
+    'If none are needed, then proceed to the response.\n\n'
+    'Tool Call Syntax: You can call tools using the following syntax:\n'
+    '{"name": function name, "parameters": dictionary of argument name and its value}.\n'
+    'Separate multiple function calls by "; ". Do not use variables.\n'
+    'Do not include anything else when calling the tools with the syntax above.\n\n'
+    'Here is a list of functions in JSON format that you can invoke.\n' %}
+
+{{- "<|header_start|>system<|header_end|>\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- tool_lib_preamble }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- render_message(system_message) }}
+{{ "<|eot|>\n" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0] %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+    {%- endif %}
+    {{- '<|header_start|>user<|header_end|>\n\n' }}
+    {{- tool_lib_preamble }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- render_message(first_user_message) + "\n<|eot|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
+        {{- render_message(message) }}
+        {{- "\n<|eot|>" }}
+    {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
+        {{- '\n<|header_start|>assistant<|header_end|>\n\n' -}}
+        {{- render_message(message) }}
+        {%- for tool_call in message.tool_calls %}
+           {{- '{"name": "' + tool_call.function.name + '", ' }}
+           {{- '"parameters": ' }}
+           {{- tool_call.function.arguments | tojson }}
+           {{- "}" }}
+        {%- endfor %}
+       {{- "\n<|eot|>" }}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "\n<|header_start|>ipython<|header_end|>\n\n" }}
+        {{- render_message(message) }}
+        {{- "\n<|eom|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '\n<|header_start|>assistant<|header_end|>\n\n' }}
+{%- endif %}
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,8 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "vllm"
 authors = [{name = "vLLM Team"}]
-license = { "file"= "LICENSE" }
+license = "Apache-2.0"
+license-files = ["LICENSE"]
 readme = "README.md"
 description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
 classifiers = [
@@ -23,7 +24,6 @@ classifiers = [
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
-    "License :: OSI Approved :: Apache Software License",
    "Intended Audience :: Developers",
    "Intended Audience :: Information Technology",
    "Intended Audience :: Science/Research",
@@ -46,8 +46,7 @@ vllm = "vllm.entrypoints.cli.main:main"

 [tool.setuptools.packages.find]
 where = ["."]
-exclude = ["benchmarks", "csrc", "docs", "examples", "tests*"]
-namespaces = false
+include = ["vllm*"]

 [tool.yapfignore]
 ignore_patterns = [
@@ -59,7 +58,8 @@ ignore_patterns = [
 line-length = 80
 exclude = [
    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py"
+    "examples/other/fp8/quantizer/quantize.py",
+    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
 ]

 [tool.ruff.lint.per-file-ignores]

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -8,7 +8,7 @@ blake3
 py-cpuinfo
 transformers >= 4.51.1
 huggingface-hub[hf_xet] >= 0.30.0  # Required for Xet downloads.
-tokenizers >= 0.19.1  # Required for Llama 3.
+tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
 aiohttp
@@ -26,7 +26,7 @@ xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
-pyzmq
+pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
 importlib_metadata

--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -12,9 +12,9 @@ torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
 torchaudio==2.6.0; platform_machine == "ppc64le"

 # required for the image processor of phi3v, this must be updated alongside torch
-torchvision; platform_machine != "ppc64le"  and platform_machine != "s390x"
+torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
 torchvision==0.21.0; platform_machine == "ppc64le"
 datasets # for benchmark scripts

 # cpu cannot use triton 3.3.0
-triton==3.2.0; platform_machine != "ppc64le"
+triton==3.2.0; platform_machine == "x86_64"
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -7,6 +7,7 @@ sphinx-togglebutton==0.3.2
 myst-parser==3.0.1
 msgspec
 cloudpickle
+commonmark # Required by sphinx-argparse when using :markdownhelp:

 # packages to install to build the documentation
 cachetools
@@ -18,6 +19,7 @@ transformers
 mistral_common >= 1.5.4
 aiohttp
 starlette
+scipy
 openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
 partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

--- a/requirements/hpu.txt
+++ b/requirements/hpu.txt
@@ -9,4 +9,4 @@ numpy==1.26.4
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
+# Dependencies required to run the entrypoints tests
+# pytest and its extensions
+pytest
+pytest-asyncio
+pytest-forked
+pytest-mock
+pytest-rerunfailures
+pytest-shard
+pytest-timeout
+
+
+librosa # required by audio tests in entrypoints/openai
+sentence-transformers
+numba == 0.61.2; python_version > '3.9'
+# testing utils
+awscli
+boto3
+botocore
+datasets
+ray >= 2.10.0
+peft
+runai-model-streamer==0.11.0
+runai-model-streamer-s3==0.11.0
+tensorizer>=2.9.0
+lm-eval==0.4.8
+buildkite-test-collector==0.1.9
+
+lm-eval[api]==0.4.8 # required for model evaluation test
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -6,6 +6,7 @@ torch==2.6.0
 torchvision==0.21.0
 torchaudio==2.6.0

+triton==3.2
 cmake>=3.26,<4
 packaging
 setuptools>=61

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -10,6 +10,7 @@ pytest-timeout
 # testing utils
 awscli
 backoff # required for phi4mm test
+blobfile # required for kimi-vl test
 einops # required for MPT, qwen-vl and Mamba
 httpx
 librosa # required for audio tests
@@ -26,14 +27,17 @@ torch==2.6.0
 torchaudio==2.6.0
 torchvision==0.21.0
 transformers_stream_generator # required for qwen-vl test
+mamba_ssm # required for plamo2 test
 matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.4 # required for pixtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
-transformers==4.51.1
+transformers==4.51.3
+tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.
+schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes>=0.45.3
 buildkite-test-collector==0.1.9

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -20,25 +20,35 @@ aiosignal==1.3.1
 annotated-types==0.7.0
    # via pydantic
 anyio==4.6.2.post1
-    # via httpx
+    # via
+    #   httpx
+    #   starlette
 argcomplete==3.5.1
    # via datamodel-code-generator
+arrow==1.3.0
+    # via isoduration
 attrs==24.2.0
    # via
    #   aiohttp
+    #   hypothesis
    #   jsonlines
    #   jsonschema
+    #   pytest-subtests
    #   referencing
 audioread==3.0.1
    # via librosa
 awscli==1.35.23
    # via -r requirements/test.in
 backoff==2.2.1
-    # via -r requirements/test.in
+    # via
+    #   -r requirements/test.in
+    #   schemathesis
 bitsandbytes==0.45.3
    # via -r requirements/test.in
 black==24.10.0
    # via datamodel-code-generator
+blobfile==3.0.0
+    # via -r requirements/test.in
 boto3==1.35.57
    # via tensorizer
 botocore==1.35.57
@@ -67,11 +77,13 @@ click==8.1.7
    #   jiwer
    #   nltk
    #   ray
+    #   schemathesis
    #   typer
 colorama==0.4.6
    # via
    #   awscli
    #   sacrebleu
+    #   schemathesis
    #   tqdm-multiprocess
 contourpy==1.3.0
    # via matplotlib
@@ -109,6 +121,7 @@ einops==0.8.0
    # via
    #   -r requirements/test.in
    #   encodec
+    #   mamba-ssm
    #   vector-quantize-pytorch
    #   vocos
 einx==0.3.0
@@ -127,6 +140,7 @@ fastsafetensors==0.1.10
    # via -r requirements/test.in
 filelock==3.16.1
    # via
+    #   blobfile
    #   datasets
    #   huggingface-hub
    #   ray
@@ -134,6 +148,8 @@ filelock==3.16.1
    #   transformers
 fonttools==4.54.1
    # via matplotlib
+fqdn==1.5.1
+    # via jsonschema
 frozendict==2.4.6
    # via einx
 frozenlist==1.5.0
@@ -152,8 +168,12 @@ genai-perf==0.0.8
    # via -r requirements/test.in
 genson==1.3.0
    # via datamodel-code-generator
+graphql-core==3.2.6
+    # via hypothesis-graphql
 h11==0.14.0
    # via httpcore
+harfile==0.3.0
+    # via schemathesis
 hf-xet==0.1.4
    # via huggingface-hub
 hiredis==3.0.0
@@ -161,7 +181,9 @@ hiredis==3.0.0
 httpcore==1.0.6
    # via httpx
 httpx==0.27.2
-    # via -r requirements/test.in
+    # via
+    #   -r requirements/test.in
+    #   schemathesis
 huggingface-hub==0.30.1
    # via
    #   -r requirements/test.in
@@ -176,17 +198,29 @@ huggingface-hub==0.30.1
    #   vocos
 humanize==4.11.0
    # via runai-model-streamer
+hypothesis==6.131.0
+    # via
+    #   hypothesis-graphql
+    #   hypothesis-jsonschema
+    #   schemathesis
+hypothesis-graphql==0.11.1
+    # via schemathesis
+hypothesis-jsonschema==0.23.1
+    # via schemathesis
 idna==3.10
    # via
    #   anyio
    #   email-validator
    #   httpx
+    #   jsonschema
    #   requests
    #   yarl
 inflect==5.6.2
    # via datamodel-code-generator
 iniconfig==2.0.0
    # via pytest
+isoduration==20.11.0
+    # via jsonschema
 isort==5.13.2
    # via datamodel-code-generator
 jinja2==3.1.6
@@ -206,12 +240,18 @@ joblib==1.4.2
    #   scikit-learn
 jsonlines==4.0.0
    # via lm-eval
+jsonpointer==3.0.0
+    # via jsonschema
 jsonschema==4.23.0
    # via
+    #   hypothesis-jsonschema
    #   mistral-common
    #   ray
+    #   schemathesis
 jsonschema-specifications==2024.10.1
    # via jsonschema
+junit-xml==1.9
+    # via schemathesis
 kaleido==0.2.1
    # via genai-perf
 kiwisolver==1.4.7
@@ -227,11 +267,17 @@ llvmlite==0.44.0
 lm-eval==0.4.8
    # via -r requirements/test.in
 lxml==5.3.0
-    # via sacrebleu
+    # via
+    #   blobfile
+    #   sacrebleu
+mamba-ssm==2.2.4
+    # via -r requirements/test.in
 markdown-it-py==3.0.0
    # via rich
 markupsafe==3.0.2
-    # via jinja2
+    # via
+    #   jinja2
+    #   werkzeug
 matplotlib==3.9.2
    # via -r requirements/test.in
 mbstrdecoder==1.1.3
@@ -263,6 +309,8 @@ mypy-extensions==1.0.0
    # via black
 networkx==3.2.1
    # via torch
+ninja==1.11.1.3
+    # via mamba-ssm
 nltk==3.9.1
    # via rouge-score
 num2words==0.5.14
@@ -355,6 +403,7 @@ packaging==24.1
    #   fastparquet
    #   huggingface-hub
    #   lazy-loader
+    #   mamba-ssm
    #   matplotlib
    #   peft
    #   plotly
@@ -426,6 +475,8 @@ pybind11==2.13.6
    # via lm-eval
 pycparser==2.22
    # via cffi
+pycryptodomex==3.22.0
+    # via blobfile
 pydantic==2.9.2
    # via
    #   datamodel-code-generator
@@ -436,6 +487,8 @@ pygments==2.18.0
    # via rich
 pyparsing==3.2.0
    # via matplotlib
+pyrate-limiter==3.7.0
+    # via schemathesis
 pytablewriter==1.2.0
    # via lm-eval
 pytest==8.3.3
@@ -448,7 +501,9 @@ pytest==8.3.3
    #   pytest-mock
    #   pytest-rerunfailures
    #   pytest-shard
+    #   pytest-subtests
    #   pytest-timeout
+    #   schemathesis
 pytest-asyncio==0.24.0
    # via -r requirements/test.in
 pytest-forked==1.6.0
@@ -459,10 +514,13 @@ pytest-rerunfailures==14.0
    # via -r requirements/test.in
 pytest-shard==0.1.2
    # via -r requirements/test.in
+pytest-subtests==0.14.1
+    # via schemathesis
 pytest-timeout==2.3.1
    # via -r requirements/test.in
 python-dateutil==2.9.0.post0
    # via
+    #   arrow
    #   botocore
    #   matplotlib
    #   pandas
@@ -484,6 +542,7 @@ pyyaml==6.0.2
    #   peft
    #   ray
    #   responses
+    #   schemathesis
    #   timm
    #   transformers
    #   vocos
@@ -514,10 +573,16 @@ requests==2.32.3
    #   pooch
    #   ray
    #   responses
+    #   schemathesis
+    #   starlette-testclient
    #   tiktoken
    #   transformers
 responses==0.25.3
    # via genai-perf
+rfc3339-validator==0.1.4
+    # via jsonschema
+rfc3987==1.3.8
+    # via jsonschema
 rich==13.9.4
    # via
    #   genai-perf
@@ -546,6 +611,8 @@ safetensors==0.4.5
    #   peft
    #   timm
    #   transformers
+schemathesis==3.39.15
+    # via -r requirements/test.in
 scikit-learn==1.5.2
    # via
    #   librosa
@@ -564,18 +631,23 @@ sentencepiece==0.2.0
    # via mistral-common
 setuptools==75.8.0
    # via
+    #   mamba-ssm
    #   pytablewriter
    #   torch
 shellingham==1.5.4
    # via typer
 six==1.16.0
    # via
+    #   junit-xml
    #   python-dateutil
+    #   rfc3339-validator
    #   rouge-score
 sniffio==1.3.1
    # via
    #   anyio
    #   httpx
+sortedcontainers==2.4.0
+    # via hypothesis
 soundfile==0.12.1
    # via
    #   -r requirements/test.in
@@ -584,6 +656,12 @@ soxr==0.5.0.post1
    # via librosa
 sqlitedict==2.1.0
    # via lm-eval
+starlette==0.46.2
+    # via
+    #   schemathesis
+    #   starlette-testclient
+starlette-testclient==0.4.1
+    # via schemathesis
 statsmodels==0.14.4
    # via genai-perf
 sympy==1.13.1
@@ -610,8 +688,14 @@ tiktoken==0.7.0
    #   mistral-common
 timm==1.0.11
    # via -r requirements/test.in
-tokenizers==0.21.0
-    # via transformers
+tokenizers==0.21.1
+    # via
+    #   -r requirements/test.in
+    #   transformers
+tomli==2.2.1
+    # via schemathesis
+tomli-w==1.2.0
+    # via schemathesis
 torch==2.6.0
    # via
    #   -r requirements/test.in
@@ -620,6 +704,7 @@ torch==2.6.0
    #   encodec
    #   fastsafetensors
    #   lm-eval
+    #   mamba-ssm
    #   peft
    #   runai-model-streamer
    #   sentence-transformers
@@ -652,11 +737,12 @@ tqdm==4.66.6
    #   transformers
 tqdm-multiprocess==0.0.11
    # via lm-eval
-transformers==4.51.1
+transformers==4.51.3
    # via
    #   -r requirements/test.in
    #   genai-perf
    #   lm-eval
+    #   mamba-ssm
    #   peft
    #   sentence-transformers
    #   transformers-stream-generator
@@ -675,6 +761,8 @@ typepy==1.3.2
    #   tabledata
 typer==0.15.2
    # via fastsafetensors
+types-python-dateutil==2.9.0.20241206
+    # via arrow
 typing-extensions==4.12.2
    # via
    #   huggingface-hub
@@ -687,8 +775,11 @@ typing-extensions==4.12.2
    #   typer
 tzdata==2024.2
    # via pandas
+uri-template==1.3.0
+    # via jsonschema
 urllib3==2.2.3
    # via
+    #   blobfile
    #   botocore
    #   requests
    #   responses
@@ -697,6 +788,10 @@ vector-quantize-pytorch==1.21.2
    # via -r requirements/test.in
 vocos==0.1.0
    # via -r requirements/test.in
+webcolors==24.11.1
+    # via jsonschema
+werkzeug==3.1.3
+    # via schemathesis
 word2number==1.1
    # via lm-eval
 xxhash==3.5.0
@@ -704,6 +799,8 @@ xxhash==3.5.0
    #   datasets
    #   evaluate
 yarl==1.17.1
-    # via aiohttp
+    # via
+    #   aiohttp
+    #   schemathesis
 zstandard==0.23.0
    # via lm-eval
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -17,9 +17,8 @@ ray[data]
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.8.0.dev20250408
+torchvision==0.22.0.dev20250408
 torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
 torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
 torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"

--- a/setup.py
+++ b/setup.py
@@ -269,15 +269,17 @@ class cmake_build_ext(build_ext):
        # First, run the standard build_ext command to compile the extensions
        super().run()

-        # copy vllm/vllm_flash_attn/*.py from self.build_lib to current
+        # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
        # directory so that they can be included in the editable build
        import glob
-        files = glob.glob(
-            os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py"))
+        files = glob.glob(os.path.join(self.build_lib, "vllm",
+                                       "vllm_flash_attn", "**", "*.py"),
+                          recursive=True)
        for file in files:
            dst_file = os.path.join("vllm/vllm_flash_attn",
-                                    os.path.basename(file))
+                                    file.split("vllm/vllm_flash_attn/")[-1])
            print(f"Copying {file} to {dst_file}")
+            os.makedirs(os.path.dirname(dst_file), exist_ok=True)
            self.copy_file(file, dst_file)


@@ -377,13 +379,22 @@ class repackage_wheel(build_ext):
                "vllm/_flashmla_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
-                "vllm/vllm_flash_attn/flash_attn_interface.py",
-                "vllm/vllm_flash_attn/__init__.py",
                "vllm/cumem_allocator.abi3.so",
                # "vllm/_version.py", # not available in nightly wheels yet
            ]
-            file_members = filter(lambda x: x.filename in files_to_copy,
-                                  wheel.filelist)
+
+            file_members = list(
+                filter(lambda x: x.filename in files_to_copy, wheel.filelist))
+
+            # vllm_flash_attn python code:
+            # Regex from
+            #  `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
+            import re
+            compiled_regex = re.compile(
+                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
+            file_members += list(
+                filter(lambda x: compiled_regex.match(x.filename),
+                       wheel.filelist))

            for file in file_members:
                print(f"Extracting and including {file.filename} "

--- a/tests/benchmarks/__init__.py
+++ b/tests/benchmarks/__init__.py
--- a/tests/benchmarks/test_latency_cli.py
+++ b/tests/benchmarks/test_latency_cli.py
+# SPDX-License-Identifier: Apache-2.0
+import subprocess
+
+import pytest
+
+MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+
+
+@pytest.mark.benchmark
+def test_bench_latency():
+    """Run `vllm bench latency` with dummy weights and expect exit code 0."""
+    cmd = ["vllm", "bench", "latency"]
+    cmd += ["--model", MODEL_NAME]
+    cmd += ["--input-len", "32", "--output-len", "1"]
+    cmd += ["--enforce-eager", "--load-format", "dummy"]
+    proc = subprocess.run(cmd, capture_output=True, text=True)
+    # Echo both captured streams.
+    print(proc.stdout)
+    print(proc.stderr)
+
+    assert proc.returncode == 0, f"Benchmark failed: {proc.stderr}"