"tests/vscode:/vscode.git/clone" did not exist on "66652e8082b69ba7d1e6aca7c234433de55f1b9b"
Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
question:{{ (messages | selectattr("role", "eq", "query") | first).content }}
passage:{{ (messages | selectattr("role", "eq", "document") | first).content }}
\ No newline at end of file
<|im_start|>system
Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
<|im_start|>user
<Instruct>: {{ messages | selectattr("role", "eq", "system") | map(attribute="content") | first | default("Given a web search query, retrieve relevant passages that answer the query") }}
<Query>: {{ messages | selectattr("role", "eq", "query") | map(attribute="content") | first }}
<Document>: {{ messages | selectattr("role", "eq", "document") | map(attribute="content") | first }}<|im_end|>
<|im_start|>assistant
<think>
</think>
<|im_start|>system
Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
<|im_start|>user
<Instruct>: {{
messages
| selectattr("role", "eq", "system")
| map(attribute="content")
| first
| default("Given a search query, retrieve relevant candidates that answer the query.")
}}<Query>:{{
messages
| selectattr("role", "eq", "query")
| map(attribute="content")
| first
}}
<Document>:{{
messages
| selectattr("role", "eq", "document")
| map(attribute="content")
| first
}}<|im_end|>
<|im_start|>assistant
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from argparse import Namespace
from pathlib import Path
from typing import Any
from vllm import LLM, EngineArgs
from vllm.utils.argparse_utils import FlexibleArgumentParser
def parse_args():
    """Build the command line interface for the reranking example and parse it.

    Every ``EngineArgs`` option is exposed on the command line. Only the
    defaults for ``model``, ``runner`` and ``trust_remote_code`` are replaced,
    so the script works out-of-the-box for reranking tasks.
    """
    # Defaults specific to this example; each can still be overridden on the CLI.
    example_defaults = {
        "model": "nvidia/llama-nemotron-rerank-1b-v2",  # Default reranking model
        "runner": "pooling",  # Required for cross-encoder/reranking models
        "trust_remote_code": True,  # Allow loading models with custom code
    }

    # Register all EngineArgs command line arguments on a fresh parser
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())
    cli.set_defaults(**example_defaults)
    return cli.parse_args()
def get_chat_template(model: str) -> str:
    """Load the chat template text for the specified reranking model.

    Reranking models require specific prompt templates to format
    query-document pairs correctly. Template files are looked up in the
    ``template`` directory that sits next to this script.

    Args:
        model: Model name as listed in the mapping below.

    Returns:
        The full content of the model's template file.

    Raises:
        ValueError: if ``model`` has no template registered in this demo.
    """
    # Directory containing all chat template files
    template_home = Path(__file__).parent / "template"

    # Mapping from model names to their corresponding template files
    # Each reranking model has its own specific prompt format
    model_name_to_template_path_map = {
        "BAAI/bge-reranker-v2-gemma": "bge-reranker-v2-gemma.jinja",
        "Qwen/Qwen3-Reranker-0.6B": "qwen3_reranker.jinja",
        "Qwen/Qwen3-Reranker-4B": "qwen3_reranker.jinja",
        "Qwen/Qwen3-Reranker-8B": "qwen3_reranker.jinja",
        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls": "qwen3_reranker.jinja",
        "tomaarsen/Qwen3-Reranker-4B-seq-cls": "qwen3_reranker.jinja",
        "tomaarsen/Qwen3-Reranker-8B-seq-cls": "qwen3_reranker.jinja",
        "mixedbread-ai/mxbai-rerank-base-v2": "mxbai_rerank_v2.jinja",
        "mixedbread-ai/mxbai-rerank-large-v2": "mxbai_rerank_v2.jinja",
        "nvidia/llama-nemotron-rerank-1b-v2": "nemotron-rerank.jinja",
    }

    # Get the template filename for the specified model
    template_path = model_name_to_template_path_map.get(model)
    if template_path is None:
        raise ValueError(f"This demo does not support model name: {model}.")

    # Decode explicitly as UTF-8 so the prompt text does not depend on the
    # platform's locale encoding.
    return (template_home / template_path).read_text(encoding="utf-8")
def get_hf_overrides(model: str) -> dict[str, Any]:
    """Return the HuggingFace config overrides this demo uses for ``model``.

    The overrides convert Large Language Models (LLMs) to Sequence
    Classification models where needed. Models that need no overrides map
    to an empty dict.

    note:
        Some reranking models require special configuration overrides to work
        correctly with vLLM's score API.
        Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/qwen3_reranker_offline.py
        Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py

    Raises:
        ValueError: if ``model`` is not one of the models listed below.
    """
    # Overrides shared by the original Qwen3 reranker checkpoints
    qwen3_original = {
        "architectures": ["Qwen3ForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    }
    # Overrides shared by the mxbai-rerank v2 checkpoints
    mxbai_v2 = {
        "architectures": ["Qwen2ForSequenceClassification"],
        "classifier_from_token": ["0", "1"],
        "method": "from_2_way_softmax",
    }

    overrides_by_model: dict[str, dict[str, Any]] = {
        "BAAI/bge-reranker-v2-gemma": {
            "architectures": ["GemmaForSequenceClassification"],
            "classifier_from_token": ["Yes"],
            "method": "no_post_processing",
        },
        "Qwen/Qwen3-Reranker-0.6B": dict(qwen3_original),
        "Qwen/Qwen3-Reranker-4B": dict(qwen3_original),
        "Qwen/Qwen3-Reranker-8B": dict(qwen3_original),
        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls": {},
        "tomaarsen/Qwen3-Reranker-4B-seq-cls": {},
        "tomaarsen/Qwen3-Reranker-8B-seq-cls": {},
        "mixedbread-ai/mxbai-rerank-base-v2": dict(mxbai_v2),
        "mixedbread-ai/mxbai-rerank-large-v2": dict(mxbai_v2),
        "nvidia/llama-nemotron-rerank-1b-v2": {},
    }

    if model not in overrides_by_model:
        raise ValueError(f"This demo does not support model name: {model}.")
    return overrides_by_model[model]
def main(args: Namespace):
    """Score three sample documents against a sample query and print the scores.

    ``args.hf_overrides`` is replaced with the overrides registered for
    ``args.model`` before the engine is created from ``args``.
    """
    # Look up the overrides for the selected model and build the engine
    args.hf_overrides = get_hf_overrides(args.model)
    engine = LLM(**vars(args))

    # Sample query and the candidate documents to rank by relevance to it
    question = "how much protein should a female eat?"
    candidates = [
        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
        "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
        "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
    ]

    # The template formats each query-document pair for the reranking model
    template = get_chat_template(args.model)
    results = engine.score(question, candidates, chat_template=template)

    # One score per document; higher scores indicate more relevant documents
    rule = "-" * 30
    print(rule)
    print([result.outputs.score for result in results])
    print(rule)


if __name__ == "__main__":
    args = parse_args()
    main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example of using the rerank API with template.
This script demonstrates how to interact with a vLLM server running
a reranking model via the REST API.
Before running this script, start the vLLM server with one of the
supported reranking models using the commands below.
note:
Some reranking models require special configuration overrides to work correctly
with vLLM's score API.
Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/qwen3_reranker_online.py
Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py
run:
vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}' --chat-template examples/pooling/score/template/bge-reranker-v2-gemma.jinja
vllm serve tomaarsen/Qwen3-Reranker-0.6B-seq-cls --chat-template examples/pooling/score/template/qwen3_reranker.jinja
vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}' --chat-template examples/pooling/score/template/mxbai_rerank_v2.jinja
vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja
vllm serve Qwen/Qwen3-Reranker-0.6B --runner pooling --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' --chat-template examples/pooling/score/template/qwen3_reranker.jinja
"""
import json
import requests
# Rerank endpoint of the vLLM server.
# Default vLLM server runs on localhost port 8000.
url = "http://127.0.0.1:8000/rerank"

# HTTP headers for the request: the body is sent as JSON and JSON is accepted back.
headers = {"accept": "application/json", "Content-Type": "application/json"}

# Example query and the candidate documents to be ranked against it.
query = "how much protein should a female eat?"
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.",
    "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.",
]

# JSON body of the rerank request sent by main().
data = {
    "model": "nvidia/llama-nemotron-rerank-1b-v2",  # Model to use for reranking
    "query": query,  # The query to score documents against
    "documents": documents,  # List of documents to be scored
}
def main():
    """Post the module-level rerank request and print the outcome.

    Sends ``data`` to ``url`` with ``headers``. On HTTP 200 the JSON
    response is pretty-printed; for any other status the status code and
    the raw response body are printed instead.
    """
    reply = requests.post(url, headers=headers, json=data)

    # Anything other than 200 is reported and ends the function early
    if reply.status_code != 200:
        print(f"Request failed with status code: {reply.status_code}")
        print(reply.text)
        return

    print("Request successful!")
    # The response includes scores for each document's relevance to the query
    print(json.dumps(reply.json(), indent=2))


if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example Python client for multimodal rerank API which is compatible with
Jina and Cohere https://jina.ai/reranker
Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
e.g.
vllm serve jinaai/jina-reranker-m0 --runner pooling
vllm serve Qwen/Qwen3-VL-Reranker-2B \
--runner pooling \
--max-model-len 4096 \
--hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' \
--chat-template examples/pooling/score/template/qwen3_vl_reranker.jinja
"""
import argparse
import json
import requests
# HTTP headers used for every request: the body is sent as JSON and JSON is accepted back.
headers = {"accept": "application/json", "Content-Type": "application/json"}

# Text query that the multimodal document below is scored against.
query = "A woman playing with her dog on a beach at sunset."

# A single multimodal document made of two content parts: a text caption
# and an image referenced by URL.
documents = {
    "content": [
        {
            "type": "text",
            "text": (
                "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
                "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
            ),
        },
        {
            "type": "image_url",
            "image_url": {
                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
            },
        },
    ]
}
def parse_args():
    """Parse the server location from the command line.

    Recognized options are ``--host`` (str, default ``"localhost"``) and
    ``--port`` (int, default ``8000``).
    """
    cli = argparse.ArgumentParser()
    # (flag, value type, default) for each supported option
    options = (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
    )
    for flag, value_type, fallback in options:
        cli.add_argument(flag, type=value_type, default=fallback)
    return cli.parse_args()
def main(args):
    """Rerank the sample multimodal document against the sample query.

    Looks up the id of the first model listed by ``/v1/models`` on the
    server at ``args.host:args.port``, posts the module-level ``query`` and
    ``documents`` to ``/rerank`` and prints the JSON result. If the rerank
    request does not return HTTP 200, the status code and raw body are
    printed instead.

    Args:
        args: Parsed command line arguments providing ``host`` and ``port``.

    Raises:
        requests.HTTPError: if the ``/v1/models`` request returns an error status.
    """
    base_url = f"http://{args.host}:{args.port}"
    models_url = base_url + "/v1/models"
    rerank_url = base_url + "/rerank"

    response = requests.get(models_url, headers=headers)
    # Surface a failed model listing as a clear HTTP error rather than an
    # opaque KeyError / JSON decoding error on the next line.
    response.raise_for_status()
    model = response.json()["data"][0]["id"]

    data = {
        "model": model,
        "query": query,
        "documents": documents,
    }
    response = requests.post(rerank_url, headers=headers, json=data)

    # Check the response
    if response.status_code == 200:
        print("Request successful!")
        print(json.dumps(response.json(), indent=2))
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)


if __name__ == "__main__":
    args = parse_args()
    main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
vision language reranker models for multimodal scoring tasks.
Vision language rerankers score the relevance between a text query and
multimodal documents (text + images/videos).
"""
from argparse import Namespace
from collections.abc import Callable
from dataclasses import asdict
from pathlib import Path
from typing import NamedTuple
from vllm import LLM, EngineArgs
from vllm.entrypoints.score_utils import ScoreMultiModalParam
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Directory named "template" located next to this script; chat template
# files are read from here.
TEMPLATE_HOME = Path(__file__).parent / "template"


class RerankModelData(NamedTuple):
    """Engine configuration for one reranker model plus an optional chat template."""

    # Arguments used to construct the vLLM engine for this model.
    engine_args: EngineArgs
    # Chat template text forwarded to ``LLM.score``; None when no template is supplied.
    chat_template: str | None = None
def run_jinavl_reranker(modality: str) -> RerankModelData:
    """Build the engine configuration for ``jinaai/jina-reranker-m0``.

    Args:
        modality: Kind of multimodal input; this example only handles ``"image"``.

    Returns:
        A ``RerankModelData`` holding the engine arguments and no chat template.

    Raises:
        ValueError: if ``modality`` is anything other than ``"image"``.
    """
    # Explicit check instead of ``assert``: it still runs under ``python -O``
    # and tells the user which value was rejected.
    if modality != "image":
        raise ValueError(
            f"jinavl_reranker only supports the 'image' modality, got {modality!r}"
        )

    engine_args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        runner="pooling",
        max_model_len=32768,
        trust_remote_code=True,
        mm_processor_kwargs={
            "min_pixels": 3136,
            "max_pixels": 602112,
        },
        # Allow one item of the selected modality per prompt
        limit_mm_per_prompt={modality: 1},
    )
    return RerankModelData(
        engine_args=engine_args,
    )
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
    """Build the engine configuration and chat template for ``Qwen/Qwen3-VL-Reranker-2B``.

    Args:
        modality: Kind of multimodal input; one item of it is allowed per prompt.

    Returns:
        A ``RerankModelData`` with the engine arguments and the content of
        ``qwen3_vl_reranker.jinja`` read from ``TEMPLATE_HOME``.
    """
    # HuggingFace model configuration overrides required for compatibility
    seq_cls_overrides = {
        # Manually route to sequence classification architecture
        # This tells vLLM to use Qwen3VLForSequenceClassification instead of
        # the default Qwen3VLForConditionalGeneration
        "architectures": ["Qwen3VLForSequenceClassification"],
        # Specify which token logits to extract from the language model head
        # The original reranker uses "no" and "yes" token logits for scoring
        "classifier_from_token": ["no", "yes"],
        # Enable special handling for original Qwen3-Reranker models
        # This flag triggers conversion logic that transforms the two token
        # vectors into a single classification vector
        "is_original_qwen3_reranker": True,
    }
    config = EngineArgs(
        model="Qwen/Qwen3-VL-Reranker-2B",
        runner="pooling",
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
        hf_overrides=seq_cls_overrides,
    )

    template_text = (TEMPLATE_HOME / "qwen3_vl_reranker.jinja").read_text()
    return RerankModelData(engine_args=config, chat_template=template_text)
# Maps each accepted --model-name value to the function that builds the
# RerankModelData for that model; each function takes the modality string.
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
    "jinavl_reranker": run_jinavl_reranker,
    "qwen3_vl_reranker": run_qwen3_vl_reranker,
}
def parse_args():
    """Parse the model and modality selection from the command line.

    ``--model-name`` / ``-m`` must be a key of ``model_example_map``
    (default ``"jinavl_reranker"``); ``--modality`` is ``"image"`` or
    ``"video"`` (default ``"image"``).
    """
    cli = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language reranker models for multimodal scoring tasks."
    )

    # Which reranker example to run
    model_option = {
        "type": str,
        "default": "jinavl_reranker",
        "choices": model_example_map.keys(),
        "help": "The name of the reranker model.",
    }
    cli.add_argument("--model-name", "-m", **model_option)

    # Which kind of multimodal document to score
    modality_option = {
        "type": str,
        "default": "image",
        "choices": ["image", "video"],
        "help": "Modality of the multimodal input (image or video).",
    }
    cli.add_argument("--modality", **modality_option)

    return cli.parse_args()
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
    """Return the sample query and multimodal document for ``modality``.

    Args:
        modality: ``"image"`` or ``"video"``.

    Returns:
        A ``(query, documents)`` pair. ``documents`` has a ``"content"`` list
        holding one text part followed by one image or video URL part.

    Raises:
        ValueError: if ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    if modality == "video":
        # Sample video document to be scored against the query
        video_document: ScoreMultiModalParam = {
            "content": [
                {
                    "type": "text",
                    "text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
                    },
                },
            ]
        }
        return "A girl is drawing pictures on an ipad.", video_document

    if modality != "image":
        raise ValueError(f"Unsupported modality: {modality}")

    # Sample image document to be scored against the query
    caption = (
        "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
        "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
    )
    image_document: ScoreMultiModalParam = {
        "content": [
            {"type": "text", "text": caption},
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
                },
            },
        ]
    }
    return "A woman playing with her dog on a beach at sunset.", image_document
def main(args: Namespace):
    """Run the selected reranker on the sample input and print the scores.

    Args:
        args: Parsed command line arguments providing ``model_name`` (a key
            of ``model_example_map``) and ``modality``.
    """
    selected_modality = args.modality

    # Build the engine from the configuration of the selected model
    build_request = model_example_map[args.model_name]
    request = build_request(selected_modality)
    engine_config = request.engine_args
    engine = LLM(**asdict(engine_config))

    # Score the sample document against the sample query
    question, candidates = get_multi_modal_input(selected_modality)
    results = engine.score(question, candidates, chat_template=request.chat_template)

    rule = "-" * 50
    print(rule)
    print(f"Model: {engine_config.model}")
    print(f"Modality: {selected_modality}")
    print(f"Query: {question}")
    print("Relevance scores:", [result.outputs.score for result in results])
    print(rule)


if __name__ == "__main__":
    args = parse_args()
    main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example online usage of Score API.
Run `vllm serve <model> --runner pooling` to start up the server in vLLM.
e.g.
vllm serve jinaai/jina-reranker-m0 --runner pooling
vllm serve Qwen/Qwen3-VL-Reranker-2B \
--runner pooling \
--max-model-len 4096 \
--hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' \
--chat-template examples/pooling/score/template/qwen3_vl_reranker.jinja
"""
import argparse
import json
import pprint
import requests
headers = {"accept": "application/json", "Content-Type": "application/json"}
def post_http_request(prompt: dict, api_url: str) -> requests.Response:
    """POST ``prompt`` as a JSON body to ``api_url`` and return the response.

    The request carries only a ``User-Agent: Test Client`` header; the
    module-level ``headers`` dict is not used here.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=prompt,
    )
# Text side of the score request.
text_1 = "slm markdown"

# Multimodal side of the score request: a content list with two images,
# each referenced by URL.
text_2 = {
    "content": [
        {
            "type": "image_url",
            "image_url": {
                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
            },
        },
        {
            "type": "image_url",
            "image_url": {
                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
            },
        },
    ]
}
def parse_args():
    """Parse the server location and model name from the command line.

    Recognized options are ``--host`` (str, default ``"localhost"``),
    ``--port`` (int, default ``8000``) and ``--model`` (str, default
    ``"jinaai/jina-reranker-m0"``).
    """
    cli = argparse.ArgumentParser()
    # (flag, value type, default) for each supported option
    options = (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "jinaai/jina-reranker-m0"),
    )
    for flag, value_type, fallback in options:
        cli.add_argument(flag, type=value_type, default=fallback)
    return cli.parse_args()
def main(args):
    """Send score requests for a text query against two image documents and print the results.

    NOTE(review): as written, this body issues two POSTs to ``/score`` --
    one through ``post_http_request`` using ``args.model`` and one through
    ``requests.post`` using the first model id reported by ``/v1/models``.
    It looks like the old and new lines of a diff merged together; confirm
    which of the two paths is intended before relying on it.

    Args:
        args: Parsed command line arguments providing ``host``, ``port`` and ``model``.
    """
    api_url = f"http://{args.host}:{args.port}/score"
    model_name = args.model
    # These locals shadow the module-level text_1 / text_2, which hold the same values.
    text_1 = "slm markdown"
    text_2 = {
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
                },
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
                },
            },
        ]
    }
    # First request: model name taken from the command line.
    prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
    score_response = post_http_request(prompt=prompt, api_url=api_url)
    base_url = f"http://{args.host}:{args.port}"
    models_url = base_url + "/v1/models"
    score_url = base_url + "/score"
    # Second request: use the id of the first model listed by the server.
    response = requests.get(models_url, headers=headers)
    model = response.json()["data"][0]["id"]
    prompt = {"model": model, "text_1": text_1, "text_2": text_2}
    response = requests.post(score_url, headers=headers, json=prompt)
    print("\nPrompt when text_1 is string and text_2 is a image list:")
    # `prompt` was rebound above, so this prints the second request's payload.
    pprint.pprint(prompt)
    print("\nScore Response:")
    # Result of the first request, then result of the second request.
    pprint.pprint(score_response.json())
    print(json.dumps(response.json(), indent=2))
if __name__ == "__main__":
......
{%- set ns = namespace(developer_content='', has_tools=false) -%}
{%- if tools is defined and tools | length > 0 -%}
{%- set ns.has_tools = true -%}
{%- endif -%}
{%- for message in messages -%}
{%- if message.role == 'developer' or message.role == 'system' -%}
<start_of_turn>user
{{ message.content }}
{%- if ns.has_tools %}
Available functions:
{%- for tool in tools %}
{%- if tool.type == 'function' %}
Function: {{ tool.function.name }}
Description: {{ tool.function.description | default('No description provided') }}
Parameters: {{ tool.function.parameters | tojson }}
{%- endif %}
{%- endfor %}
{%- endif %}
<end_of_turn>
{%- elif message.role == 'user' -%}
<start_of_turn>user
{{ message.content }}<end_of_turn>
{%- elif message.role == 'assistant' -%}
{%- if message.tool_calls is defined and message.tool_calls | length > 0 -%}
<start_of_turn>model
{%- for tool_call in message.tool_calls %}
<start_function_call>call:{{ tool_call.function.name }}{
{%- set args = tool_call.function.arguments -%}
{%- if args is string -%}
{%- set args = args | fromjson -%}
{%- endif -%}
{%- for key, value in args.items() -%}
{{ key }}:<escape>{{ value }}<escape>{% if not loop.last %},{% endif %}
{%- endfor -%}
}<end_function_call>
{%- endfor %}
<end_of_turn>
{%- else -%}
<start_of_turn>model
{{ message.content }}<end_of_turn>
{%- endif -%}
{%- elif message.role == 'tool' -%}
<start_of_turn>user
Function result for {{ message.name | default('function') }}: {{ message.content }}<end_of_turn>
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
<start_of_turn>model
{%- endif -%}
{%- set counter = namespace(index=0) -%}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{%- if messages and messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "You are a helpful assistant." %}
{%- endif %}
{%- if tools is not none %}
{%- set tool_instruction %}
You have access to the following tools. When you need to call a tool, you MUST use the following format:
<tool_call>function_name
<arg_key>parameter_name</arg_key>
<arg_value>parameter_value</arg_value>
</tool_call>
Important rules:
- Always wrap tool calls with <tool_call>...</tool_call> tags
- Put the function name on the first line after <tool_call>
- Use <arg_key> and <arg_value> tags for each parameter
- If a parameter value is a string, keep it as-is. If it's a number or boolean, convert it appropriately
- You can make multiple tool calls if needed
- If no tool is suitable, respond with regular text
Available tools:
{% endset %}
{{- tool_instruction + "\n\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}}
{%- set counter.index = counter.index + 1 -%}
{%- endif -%}
{%- if message['role'] == 'assistant' -%}
{{- '\n答:' + message['content'] -}}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n' -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- '\n答:' -}}
{%- endif -%}
......@@ -80,6 +80,7 @@ plugins:
- "re:vllm\\._.*" # Internal modules
- "vllm.third_party"
- "vllm.vllm_flash_attn"
- "re:vllm\\.grpc\\..*_pb2.*" # Auto-generated protobuf files
- !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default
- mkdocstrings:
handlers:
......@@ -87,7 +88,8 @@ plugins:
options:
show_symbol_type_heading: true
show_symbol_type_toc: true
filters: []
filters:
- "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs
summary:
modules: true
show_if_no_docstring: true
......
......@@ -6,9 +6,10 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<81.0.0",
"setuptools-scm>=8.0",
"torch >= 2.7.1",
"torch == 2.9.0",
"wheel",
"jinja2",
"grpcio-tools>=1.76.0",
]
build-backend = "setuptools.build_meta"
......@@ -55,6 +56,10 @@ include = ["vllm*"]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# Exclude generated protobuf files
"vllm/grpc/*_pb2.py" = ["ALL"]
"vllm/grpc/*_pb2_grpc.py" = ["ALL"]
"vllm/grpc/*_pb2.pyi" = ["ALL"]
[tool.ruff.lint]
select = [
......@@ -120,7 +125,7 @@ python = "./.venv"
# these files may contain non-English words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*"]
"vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
......@@ -162,6 +167,7 @@ depthwise_seperable_CNN = "depthwise_seperable_CNN"
[tool.typos.default.extend-words]
iy = "iy"
tendencias = "tendencias"
indx = "indx"
# intel cpu features
tme = "tme"
dout = "dout"
......@@ -302,4 +308,4 @@ windo = "windo"
[tool.typos.type.vimscript.extend-words]
[tool.uv]
no-build-isolation-package = ["torch"]
no-build-isolation-package = ["torch"]
\ No newline at end of file
......@@ -4,8 +4,10 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
torch==2.9.0
torch==2.9.1
wheel
jinja2>=3.1.6
regex
build
protobuf>=6.33.2
grpcio-tools>=1.76.0
......@@ -9,7 +9,7 @@ blake3
py-cpuinfo
transformers >= 4.56.0, < 5
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
protobuf >= 6.30.0 # Required by LlamaTokenizer, gRPC.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
openai >= 1.99.1 # For Responses API with reasoning content
......@@ -24,25 +24,24 @@ outlines_core == 0.2.11
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
xgrammar == 0.1.29; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.17.0
mistral_common[image] >= 1.8.5
mistral_common[image] >= 1.8.8
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.12.2 # required for compressed-tensors
compressed-tensors == 0.13.0 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects
......@@ -50,5 +49,7 @@ ijson # Required for mistral streaming tool parser
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0
model-hosting-container-standards >= 0.1.9, < 1.0.0
mcp
\ No newline at end of file
model-hosting-container-standards >= 0.1.10, < 1.0.0
mcp
grpcio>=1.76.0
grpcio-reflection>=1.76.0
\ No newline at end of file
cmake>=3.26.1
ninja
packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools==77.0.3 # this version can reuse CMake build dir
setuptools-scm>=8
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
......
# Common dependencies
-r common.txt
setuptools==77.0.3 # this version can reuse CMake build dir
numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
# Dependencies for CPUs
......
......@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.9.0
torchaudio==2.9.0
torch==2.9.1
torchaudio==2.9.1
# These must be updated alongside torch
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.3
tblib
lm_eval[api]
\ No newline at end of file
......@@ -17,17 +17,17 @@ vocos # required for minicpmo_26 test
peft
pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
sentence-transformers # required for embedding tests
sentence-transformers>=5.2.0 # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.5 # required for voxtral test
mistral_common[image,audio] >= 1.8.8 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
lm-eval[api]>=0.4.9.2 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.57.3
tokenizers==0.22.0
......
......@@ -2,11 +2,11 @@
-r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.4
torch==2.9.0
torchvision==0.24.0
torchaudio==2.9.0
torch==2.9.1
torchvision==0.24.1
torchaudio==2.9.1
triton==3.5.0
triton==3.5.1
cmake>=3.26.1,<4
packaging>=24.2
setuptools>=77.0.3,<80.0.0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment