Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -8,7 +8,7 @@ with the reasoning parser:
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-     --enable-reasoning --reasoning-parser deepseek_r1
+    --reasoning-parser deepseek_r1
 ```
 This example demonstrates how to generate chat completions from reasoning models

--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -8,7 +8,7 @@ parser:
 ```bash
 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-     --enable-reasoning --reasoning-parser deepseek_r1
+     --reasoning-parser deepseek_r1
 ```
 Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the

--- a/examples/online_serving/openai_classification_client.py
+++ b/examples/online_serving/openai_classification_client.py
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import pprint
+import requests
+def post_http_request(payload: dict, api_url: str) -> requests.Response:
+    """POST ``payload`` as a JSON body to ``api_url`` and return the response.
+    The response is returned as-is; status codes are not checked here.
+    """
+    return requests.post(api_url,
+                         headers={"User-Agent": "Test Client"},
+                         json=payload)
+def parse_args():
+    """Parse ``--host``, ``--port`` and ``--model`` from the command line."""
+    cli = argparse.ArgumentParser()
+    # (flag, type, default) for every supported option
+    options = (
+        ("--host", str, "localhost"),
+        ("--port", int, 8000),
+        ("--model", str, "jason9693/Qwen2.5-1.5B-apeach"),
+    )
+    for flag, kind, fallback in options:
+        cli.add_argument(flag, type=kind, default=fallback)
+    return cli.parse_args()
+def main(args):
+    """Send a fixed batch of prompts to the server's ``/classify`` route.
+    Args:
+        args: Parsed options providing ``host``, ``port`` and ``model``.
+    The decoded JSON reply is pretty-printed to stdout.
+    """
+    endpoint = f"http://{args.host}:{args.port}/classify"
+    request_body = {
+        "model": args.model,
+        "input": [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ],
+    }
+    reply = post_http_request(payload=request_body, api_url=endpoint)
+    pprint.pprint(reply.json())
+# Run the example only when executed as a script, not when imported
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -46,11 +46,15 @@ async def stream_openai_response():
        "model": "openai/whisper-large-v3",
    }
    url = openai_api_base + "/audio/transcriptions"
+    headers = {"Authorization": f"Bearer {openai_api_key}"}
    print("transcription result:", end=' ')
    async with httpx.AsyncClient() as client:
        with open(str(winning_call), "rb") as f:
-            async with client.stream('POST', url, files={'file': f},
+            async with client.stream('POST',
-                                     data=data) as response:
+                                     url,
+                                     files={'file': f},
+                                     data=data,
+                                     headers=headers) as response:
                async for line in response.aiter_lines():
                    # Each line is a JSON object prefixed with 'data: '
                    if line:

--- a/examples/online_serving/opentelemetry/Otel.md
+++ b/examples/online_serving/opentelemetry/Otel.md
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
 # SPDX-License-Identifier: Apache-2.0
 """
 Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
-See Ray Serve LLM documentation at:
+See more details at:
+https://docs.ray.io/en/latest/serve/tutorials/serve-deepseek.html
+And see Ray Serve LLM documentation at:
 https://docs.ray.io/en/latest/serve/llm/serving-llms.html
 Run `python3 ray_serve_deepseek.py` to deploy the model.

--- a/examples/online_serving/retrieval_augmented_generation_with_langchain.py
+++ b/examples/online_serving/retrieval_augmented_generation_with_langchain.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+Retrieval Augmented Generation (RAG) Implementation with Langchain
+==================================================================
+This script demonstrates a RAG implementation using LangChain, Milvus
+and vLLM. RAG enhances LLM responses by retrieving relevant context
+from a document collection.
+Features:
+- Web content loading and chunking
+- Vector storage with Milvus
+- Embedding generation with vLLM
+- Question answering with context
+Prerequisites:
+1. Install dependencies:
+    pip install -U vllm \
+                 langchain_milvus langchain_openai \
+                 langchain_community beautifulsoup4 \
+                 langchain-text-splitters
+2. Start services:
+    # Start embedding service (port 8000)
+    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
+    # Start chat service (port 8001)
+    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
+Usage:
+    python retrieval_augmented_generation_with_langchain.py
+Notes:
+    - Ensure both vLLM services are running before executing
+    - Default ports: 8000 (embedding), 8001 (chat)
+    - First run may take time to download models
+"""
+import argparse
+from argparse import Namespace
+from typing import Any
+from langchain_community.document_loaders import WebBaseLoader
+from langchain_core.documents import Document
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_milvus import Milvus
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+def load_and_split_documents(config: dict[str, Any]):
+    """
+    Fetch the page at config["url"] and cut it into overlapping chunks.
+    Chunking uses config["chunk_size"] and config["chunk_overlap"]. Any
+    failure is reported on stdout and then re-raised to the caller.
+    """
+    try:
+        pages = WebBaseLoader(web_paths=(config["url"], )).load()
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=config["chunk_size"],
+            chunk_overlap=config["chunk_overlap"],
+        )
+        chunks = splitter.split_documents(pages)
+    except Exception as e:
+        print(f"Error loading document from {config['url']}: {str(e)}")
+        raise
+    return chunks
+def init_vectorstore(config: dict[str, Any], documents: list[Document]):
+    """
+    Embed the documents through the vLLM embedding endpoint and store them
+    in a Milvus collection, dropping any previously existing one
+    """
+    embedder = OpenAIEmbeddings(
+        model=config["embedding_model"],
+        openai_api_key=config["vllm_api_key"],
+        openai_api_base=config["vllm_embedding_endpoint"],
+    )
+    milvus_connection = {"uri": config["uri"]}
+    return Milvus.from_documents(
+        documents=documents,
+        embedding=embedder,
+        connection_args=milvus_connection,
+        drop_old=True,
+    )
+def init_llm(config: dict[str, Any]):
+    """
+    Create the chat model client that talks to the vLLM chat endpoint
+    """
+    chat_settings = {
+        "model": config["chat_model"],
+        "openai_api_key": config["vllm_api_key"],
+        "openai_api_base": config["vllm_chat_endpoint"],
+    }
+    return ChatOpenAI(**chat_settings)
+def get_qa_prompt():
+    """
+    Build the prompt template used for answering; it expects the
+    {question} and {context} variables
+    """
+    qa_template = """You are an assistant for question-answering tasks.
+Use the following pieces of retrieved context to answer the question.
+If you don't know the answer, just say that you don't know.
+Use three sentences maximum and keep the answer concise.
+Question: {question}
+Context: {context}
+Answer:
+"""
+    qa_prompt = PromptTemplate.from_template(qa_template)
+    return qa_prompt
+def format_docs(docs: list[Document]):
+    """
+    Join the page content of all documents, separated by one blank line
+    """
+    page_texts = [document.page_content for document in docs]
+    separator = "\n\n"
+    return separator.join(page_texts)
+def create_qa_chain(retriever: Any, llm: ChatOpenAI, prompt: PromptTemplate):
+    """
+    Compose retriever, prompt, llm and string parser into a single chain.
+    The incoming question is passed through unchanged and also used to
+    retrieve the context documents.
+    """
+    chain_inputs = {
+        "context": retriever | format_docs,
+        "question": RunnablePassthrough(),
+    }
+    prompted = chain_inputs | prompt
+    answered = prompted | llm
+    return answered | StrOutputParser()
+def get_parser() -> argparse.ArgumentParser:
+    """
+    Build the command line parser for the LangChain RAG example
+    """
+    cli = argparse.ArgumentParser(description='RAG with vLLM and langchain')
+    add = cli.add_argument
+    # Credentials and service endpoints
+    add('--vllm-api-key',
+        default="EMPTY",
+        help='API key for vLLM compatible services')
+    add('--vllm-embedding-endpoint',
+        default="http://localhost:8000/v1",
+        help='Base URL for embedding service')
+    add('--vllm-chat-endpoint',
+        default="http://localhost:8001/v1",
+        help='Base URL for chat service')
+    # Storage location and document source
+    add('--uri', default="./milvus.db", help='URI for Milvus database')
+    add('--url',
+        default=("https://docs.vllm.ai/en/latest/getting_started/"
+                 "quickstart.html"),
+        help='URL of the document to process')
+    # Model names
+    add('--embedding-model',
+        default="ssmits/Qwen2-7B-Instruct-embed-base",
+        help='Model name for embeddings')
+    add('--chat-model',
+        default="qwen/Qwen1.5-0.5B-Chat",
+        help='Model name for chat')
+    # Run mode and retrieval tuning
+    add('-i',
+        '--interactive',
+        action='store_true',
+        help='Enable interactive Q&A mode')
+    add('-k',
+        '--top-k',
+        type=int,
+        default=3,
+        help='Number of top results to retrieve')
+    add('-c',
+        '--chunk-size',
+        type=int,
+        default=1000,
+        help='Chunk size for document splitting')
+    add('-o',
+        '--chunk-overlap',
+        type=int,
+        default=200,
+        help='Chunk overlap for document splitting')
+    return cli
+def init_config(args: Namespace):
+    """
+    Copy the parsed command line options into a plain configuration dict
+    """
+    # Option names double as the configuration keys
+    option_names = (
+        "vllm_api_key",
+        "vllm_embedding_endpoint",
+        "vllm_chat_endpoint",
+        "uri",
+        "embedding_model",
+        "chat_model",
+        "url",
+        "chunk_size",
+        "chunk_overlap",
+        "top_k",
+    )
+    return {name: getattr(args, name) for name in option_names}
+def main():
+    """
+    Run the RAG demo: index the document, then answer either one fixed
+    question or an interactive series of questions
+    """
+    args = get_parser().parse_args()
+    config = init_config(args)
+    # Retrieval side: chunked documents -> vector store -> top-k retriever
+    chunks = load_and_split_documents(config)
+    store = init_vectorstore(config, chunks)
+    retriever = store.as_retriever(search_kwargs={"k": config["top_k"]})
+    # Generation side, wired together with the retriever
+    qa_chain = create_qa_chain(retriever, init_llm(config), get_qa_prompt())
+    if not args.interactive:
+        # Default single question mode
+        answer = qa_chain.invoke("How to install vLLM?")
+        print("-" * 50)
+        print(answer)
+        print("-" * 50)
+        return
+    # Interactive mode: keep answering until the user asks to quit
+    print("\nWelcome to Interactive Q&A System!")
+    print("Enter 'q' or 'quit' to exit.")
+    while True:
+        question = input("\nPlease enter your question: ")
+        if question.lower() in ('q', 'quit'):
+            print("\nThank you for using! Goodbye!")
+            break
+        print(qa_chain.invoke(question))
+# Run the demo only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
+++ b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
+===================================================================
+This script demonstrates a RAG system using:
+- LlamaIndex: For document indexing and retrieval
+- Milvus: As vector store backend
+- vLLM: For embedding and text generation
+Features:
+1. Document Loading & Processing
+2. Embedding & Storage
+3. Query Processing
+Requirements:
+1. Install dependencies:
+pip install llama-index llama-index-readers-web \
+            llama-index-llms-openai-like    \
+            llama-index-embeddings-openai-like \
+            llama-index-vector-stores-milvus
+2. Start services:
+    # Start embedding service (port 8000)
+    vllm serve ssmits/Qwen2-7B-Instruct-embed-base
+    # Start chat service (port 8001)
+    vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
+Usage:
+    python retrieval_augmented_generation_with_llamaindex.py
+Notes:
+    - Ensure both vLLM services are running before executing
+    - Default ports: 8000 (embedding), 8001 (chat)
+    - First run may take time to download models
+"""
+import argparse
+from argparse import Namespace
+from typing import Any
+from llama_index.core import Settings, StorageContext, VectorStoreIndex
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.embeddings.openai_like import OpenAILikeEmbedding
+from llama_index.llms.openai_like import OpenAILike
+from llama_index.readers.web import SimpleWebPageReader
+from llama_index.vector_stores.milvus import MilvusVectorStore
+def init_config(args: Namespace):
+    """Copy the parsed command line options into a plain configuration dict"""
+    # Option names double as the configuration keys
+    option_names = (
+        "url",
+        "embedding_model",
+        "chat_model",
+        "vllm_api_key",
+        "embedding_endpoint",
+        "chat_endpoint",
+        "db_path",
+        "chunk_size",
+        "chunk_overlap",
+        "top_k",
+    )
+    return {name: getattr(args, name) for name in option_names}
+def load_documents(url: str) -> list:
+    """Download the page at ``url`` and return it as text documents"""
+    reader = SimpleWebPageReader(html_to_text=True)
+    return reader.load_data([url])
+def setup_models(config: dict[str, Any]):
+    """Register embedding model, chat model and splitter on ``Settings``"""
+    # Embeddings are served by the vLLM embedding endpoint
+    embedder = OpenAILikeEmbedding(
+        api_base=config["embedding_endpoint"],
+        api_key=config["vllm_api_key"],
+        model_name=config["embedding_model"],
+    )
+    Settings.embed_model = embedder
+    # Answers are generated by the vLLM chat endpoint
+    chat_llm = OpenAILike(
+        model=config["chat_model"],
+        api_key=config["vllm_api_key"],
+        api_base=config["chat_endpoint"],
+        context_window=128000,
+        is_chat_model=True,
+        is_function_calling_model=False,
+    )
+    Settings.llm = chat_llm
+    # Documents are chunked with the configured size and overlap
+    splitter = SentenceSplitter(
+        chunk_size=config["chunk_size"],
+        chunk_overlap=config["chunk_overlap"],
+    )
+    Settings.transformations = [splitter]
+def setup_vector_store(db_path: str) -> MilvusVectorStore:
+    """Create the Milvus store, sized to the embedding model's output.
+    A probe text is embedded once to learn the vector dimension; it relies
+    on ``Settings.embed_model`` having been configured beforehand.
+    """
+    dimension = len(Settings.embed_model.get_text_embedding("test"))
+    print(f"Embedding dimension: {dimension}")
+    return MilvusVectorStore(uri=db_path, dim=dimension, overwrite=True)
+def create_index(documents: list, vector_store: MilvusVectorStore):
+    """Index the documents into the given vector store"""
+    context = StorageContext.from_defaults(vector_store=vector_store)
+    index = VectorStoreIndex.from_documents(documents,
+                                            storage_context=context)
+    return index
+def query_document(index: VectorStoreIndex, question: str, top_k: int):
+    """Answer ``question`` from the index using the ``top_k`` closest chunks"""
+    return index.as_query_engine(similarity_top_k=top_k).query(question)
+def get_parser() -> argparse.ArgumentParser:
+    """Build the command line parser for the LlamaIndex RAG example"""
+    cli = argparse.ArgumentParser(description='RAG with vLLM and LlamaIndex')
+    add = cli.add_argument
+    # Document source and model names
+    add('--url',
+        default=("https://docs.vllm.ai/en/latest/getting_started/"
+                 "quickstart.html"),
+        help='URL of the document to process')
+    add('--embedding-model',
+        default="ssmits/Qwen2-7B-Instruct-embed-base",
+        help='Model name for embeddings')
+    add('--chat-model',
+        default="qwen/Qwen1.5-0.5B-Chat",
+        help='Model name for chat')
+    # Credentials, service endpoints and storage
+    add('--vllm-api-key',
+        default="EMPTY",
+        help='API key for vLLM compatible services')
+    add('--embedding-endpoint',
+        default="http://localhost:8000/v1",
+        help='Base URL for embedding service')
+    add('--chat-endpoint',
+        default="http://localhost:8001/v1",
+        help='Base URL for chat service')
+    add('--db-path',
+        default="./milvus_demo.db",
+        help='Path to Milvus database')
+    # Run mode and retrieval tuning
+    add('-i',
+        '--interactive',
+        action='store_true',
+        help='Enable interactive Q&A mode')
+    add('-c',
+        '--chunk-size',
+        type=int,
+        default=1000,
+        help='Chunk size for document splitting')
+    add('-o',
+        '--chunk-overlap',
+        type=int,
+        default=200,
+        help='Chunk overlap for document splitting')
+    add('-k',
+        '--top-k',
+        type=int,
+        default=3,
+        help='Number of top results to retrieve')
+    return cli
+def main():
+    """Run the RAG demo: index the document, then answer either one fixed
+    question or an interactive series of questions
+    """
+    args = get_parser().parse_args()
+    config = init_config(args)
+    documents = load_documents(config["url"])
+    # Models must be configured before the vector store probes the embedder
+    setup_models(config)
+    index = create_index(documents, setup_vector_store(config["db_path"]))
+    rule = "-" * 50
+    if not args.interactive:
+        # Single query mode
+        response = query_document(index, "How to install vLLM?",
+                                  config["top_k"])
+        print(rule)
+        print("Response:\n")
+        print(response)
+        print(rule)
+        return
+    print("\nEntering interactive mode. Type 'quit' to exit.")
+    while True:
+        question = input("\nEnter your question: ")
+        if question.lower() in ('quit', 'exit', 'q'):
+            print("Exiting interactive mode...")
+            break
+        # The header is printed before the (possibly slow) query runs
+        print("\n" + rule)
+        print("Response:\n")
+        print(query_document(index, question, config["top_k"]))
+        print(rule)
+# Run the demo only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+vLLM Chat Assistant - A Streamlit Web Interface
+A streamlined chat interface that quickly integrates
+with vLLM API server.
+Features:
+- Multiple chat sessions management
+- Streaming response display
+- Configurable API endpoint
+- Real-time chat history
+Requirements:
+    pip install streamlit openai
+Usage:
+    # Start the app with default settings
+    streamlit run streamlit_openai_chatbot_webserver.py
+    # Start with custom vLLM API endpoint
+    VLLM_API_BASE="http://your-server:8000/v1" \
+        streamlit run streamlit_openai_chatbot_webserver.py
+    # Enable debug mode
+    streamlit run streamlit_openai_chatbot_webserver.py \
+        --logger.level=debug
+"""
+import os
+from datetime import datetime
+import streamlit as st
+from openai import OpenAI
+# Read the connection settings from environment variables
+openai_api_key = os.getenv('VLLM_API_KEY', "EMPTY")
+openai_api_base = os.getenv('VLLM_API_BASE', "http://localhost:8000/v1")
+# Seed each session-state entry the app uses, but only when it is missing,
+# so existing values are kept
+for _state_key, _initial_value in (
+    ("sessions", {}),
+    ("current_session", None),
+    ("messages", []),
+    ("active_session", None),
+    ("api_base_url", openai_api_base),
+):
+    if _state_key not in st.session_state:
+        st.session_state[_state_key] = _initial_value
+def create_new_chat_session():
+    """Start an empty chat session keyed by the current time and select it"""
+    state = st.session_state
+    new_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    state.sessions[new_id] = []
+    state.current_session = state.active_session = new_id
+    state.messages = []
+def switch_to_chat_session(session_id):
+    """Select ``session_id`` and show its stored message history"""
+    state = st.session_state
+    state.current_session = state.active_session = session_id
+    state.messages = state.sessions[session_id]
+def get_llm_response(messages, model):
+    """Request a streaming chat completion from the module-level client
+    Args:
+        messages: List of message dictionaries
+        model: Name of model
+    Returns:
+        The streaming response object, or an "Error: ..." string when the
+        request fails (the error is also shown in the UI)
+    """
+    try:
+        return client.chat.completions.create(model=model,
+                                              messages=messages,
+                                              stream=True)
+    except Exception as e:
+        st.error(f"Error details: {str(e)}")
+        return f"Error: {str(e)}"
+# Sidebar - API Settings first
+st.sidebar.title("API Settings")
+new_api_base = st.sidebar.text_input("API Base URL:",
+                                     value=st.session_state.api_base_url)
+if new_api_base != st.session_state.api_base_url:
+    # Persist the edited URL and rerun so the client below is built with it
+    st.session_state.api_base_url = new_api_base
+    st.rerun()
+st.sidebar.divider()
+# Sidebar - Session Management
+st.sidebar.title("Chat Sessions")
+if st.sidebar.button("New Session"):
+    create_new_chat_session()
+# Display all sessions in reverse chronological order
+for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
+    # Mark the active session with a pinned button
+    if session_id == st.session_state.active_session:
+        st.sidebar.button(f"📍 {session_id}",
+                          key=session_id,
+                          type="primary",
+                          on_click=switch_to_chat_session,
+                          args=(session_id, ))
+    else:
+        st.sidebar.button(f"Session {session_id}",
+                          key=session_id,
+                          on_click=switch_to_chat_session,
+                          args=(session_id, ))
+# Main interface
+st.title("vLLM Chat Assistant")
+# Initialize OpenAI client with API settings
+client = OpenAI(api_key=openai_api_key, base_url=st.session_state.api_base_url)
+# Get and display current model id (the first model the server reports)
+models = client.models.list()
+model = models.data[0].id
+st.markdown(f"**Model**: {model}")
+# Initialize first session if none exists
+if st.session_state.current_session is None:
+    create_new_chat_session()
+    st.session_state.active_session = st.session_state.current_session
+# Display chat history for current session
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.write(message["content"])
+# Handle user input and generate llm response
+if prompt := st.chat_input("Type your message here..."):
+    # Save user message to session; the session entry and the message list
+    # are the same list object from here on, so later appends reach both
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    st.session_state.sessions[
+        st.session_state.current_session] = st.session_state.messages
+    # Display user message
+    with st.chat_message("user"):
+        st.write(prompt)
+    # Prepare messages for llm
+    messages_for_llm = [{
+        "role": m["role"],
+        "content": m["content"]
+    } for m in st.session_state.messages]
+    # Generate and display llm response
+    with st.chat_message("assistant"):
+        message_placeholder = st.empty()
+        full_response = ""
+        # Get streaming response from llm
+        response = get_llm_response(messages_for_llm, model)
+        if isinstance(response, str):
+            # A plain string is the error text returned by get_llm_response
+            message_placeholder.markdown(response)
+            full_response = response
+        else:
+            for chunk in response:
+                # A stream chunk may carry an empty ``choices`` list (for
+                # example a usage-only chunk); skip it instead of raising
+                # IndexError on choices[0]
+                if not chunk.choices:
+                    continue
+                delta = chunk.choices[0].delta
+                if hasattr(delta, "content"):
+                    content = delta.content
+                    if content:
+                        full_response += content
+                        # Trailing block character acts as a typing cursor
+                        message_placeholder.markdown(full_response + "▌")
+            message_placeholder.markdown(full_response)
+    # Save llm response to session history
+    st.session_state.messages.append({
+        "role": "assistant",
+        "content": full_response
+    })
--- a/examples/online_serving/utils.py
+++ b/examples/online_serving/utils.py
+# SPDX-License-Identifier: Apache-2.0
+from openai import APIConnectionError, OpenAI
+from openai.pagination import SyncPage
+from openai.types.model import Model
+def get_first_model(client: OpenAI) -> str:
+    """
+    Get the id of the first model served by the vLLM server.
+    Args:
+        client: OpenAI client pointed at the vLLM server.
+    Returns:
+        The ``id`` of the first entry in the server's model list.
+    Raises:
+        RuntimeError: If the server cannot be reached or lists no models.
+    """
+    try:
+        models: SyncPage[Model] = client.models.list()
+    except APIConnectionError as e:
+        # The API key is intentionally not echoed here so the secret does
+        # not end up in logs or tracebacks
+        raise RuntimeError(
+            "Failed to get the list of models from the vLLM server at "
+            f"{client.base_url}. Check\n"
+            "1. the server is running\n"
+            "2. the server URL is correct\n"
+            "3. the API key is correct") from e
+    if not models.data:
+        raise RuntimeError(
+            f"No models found on the vLLM server at {client.base_url}")
+    return models.data[0].id
--- a/examples/template_florence2.jinja
+++ b/examples/template_florence2.jinja
-{%- for message in messages -%}
-    {%- if message['role'] == 'user' -%}
-        {{- message['content'] -}}
-    {%- elif message['role'] == 'assistant' -%}
-        {{- message['content'] -}}
-    {%- endif -%}
-{%- endfor -%}
--- a/examples/template_llava.jinja
+++ b/examples/template_llava.jinja
-{%- if messages[0]['role'] == 'system' -%}
-    {%- set system_message = messages[0]['content'] -%}
-    {%- set messages = messages[1:] -%}
-{%- else -%}
-    {% set system_message = '' -%}
-{%- endif -%}
-{{ bos_token + system_message }}
-{%- for message in messages -%}
-    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
-        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
-    {%- endif -%}
-    {%- if message['role'] == 'user' -%}
-        {{ 'USER: ' + message['content'] + '\n' }}
-    {%- elif message['role'] == 'assistant' -%}
-        {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
-    {%- endif -%}
-{%- endfor -%}
-{%- if add_generation_prompt -%}
-    {{ 'ASSISTANT:' }}
-{% endif %}
--- a/examples/tool_chat_template_deepseekv3.jinja
+++ b/examples/tool_chat_template_deepseekv3.jinja
+{#- Chat template with tool-calling support (DeepSeek-V3 style tokens).
+    1. Concatenates all system messages into one system prompt.
+    2. Emits bos_token, the system prompt and, when tools are given, the
+       tool list plus instructions for the tool-call format.
+    3. Renders user, assistant (with or without tool_calls) and tool
+       messages in order, tracking state in the `ns` namespace.
+    4. Closes an open tool-output section and optionally appends the
+       assistant generation prompt.
+-#}
+{% if not add_generation_prompt is defined %}
+    {% set add_generation_prompt = false %}
+{% endif %}
+{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %}
+{%- for message in messages %}
+    {%- if message['role'] == 'system' %}
+        {%- if ns.is_first_sp %}
+            {% set ns.system_prompt = ns.system_prompt + message['content'] %}
+            {% set ns.is_first_sp = false %}
+        {%- else %}
+            {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{{ bos_token }}
+{{ ns.system_prompt }}
+{%- if tools %}
+    {{"\n\n# Tools\n\nYou may call one or more functions to assist with the user query." }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{"\n</tools>\n\n"}}
+    {{"For function call returns, you should first print <｜tool▁calls▁begin｜>"}}
+    {{"For each function call, you should return object like:\n" }}
+    {{"<｜tool▁call▁begin｜>function<｜tool▁sep｜><function_name>\n```json\n<function_arguments_in_json_format>\n```<｜tool▁call▁end｜>"}}
+    {{"At the end of function call returns, you should print <｜tool▁calls▁end｜><｜end▁of▁sentence｜>"}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if message['role'] == 'user' %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_first = false -%}
+        {%- set ns.is_last_user = true -%}
+        {{'<｜User｜>' + message['content'] + '<｜Assistant｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{'<｜tool▁outputs▁end｜>'}}
+        {%- endif %}
+        {%- set ns.is_first = false %}
+        {%- set ns.is_tool = false -%}
+        {%- set ns.is_output_first = true %}
+        {%- for tool in message['tool_calls'] %}
+            {%- if not ns.is_first %}
+                {%- if message['content'] is none %}
+                    {{'<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- else %}
+                    {{message['content'] + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+                {%- endif %}
+            {%- set ns.is_first = true -%}
+            {%- else %}
+                {{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<｜tool▁call▁end｜>'}}
+            {%- endif %}
+        {%- endfor %}
+        {{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}
+    {%- endif %}
+    {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%}
+        {%- set ns.is_last_user = false -%}
+        {%- if ns.is_tool %}
+            {{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}
+            {%- set ns.is_tool = false -%}
+        {%- else %}
+            {% set content = message['content'] %}
+            {{content + '<｜end▁of▁sentence｜>'}}
+        {%- endif %}
+    {%- endif %}
+    {%- if message['role'] == 'tool' %}
+        {%- set ns.is_last_user = false -%}
+        {%- set ns.is_tool = true -%}
+        {%- if ns.is_output_first %}
+            {{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}
+            {%- set ns.is_output_first = false %}
+        {%- else %}
+            {{'\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}
+        {%- endif %}
+    {%- endif %}
+{%- endfor -%}
+{% if ns.is_tool %}
+    {{'<｜tool▁outputs▁end｜>'}}
+{% endif %}
+{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %}
+    {{'<｜Assistant｜>'}}
+{% endif %}
--- a/examples/tool_chat_template_mistral3.jinja
+++ b/examples/tool_chat_template_mistral3.jinja
+{#- Preamble: resolve the system prompt and normalise `tools` before any conversation message is rendered. #}
+{#- `strftime_now` is not defined in this template; it has to be supplied by the rendering environment. #}
+{%- set today = strftime_now("%Y-%m-%d") %}
+{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\")" %}
+{{- bos_token }}
+{#- A leading system message replaces the default prompt. Its content may be a plain string or a list of blocks; for a list only the first block's `text` is used. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- if messages[0]['content'] is string %}
+        {%- set system_message = messages[0]['content'] %}
+        {%- set loop_messages = messages[1:] %}
+    {%- else %}
+        {%- set system_message = messages[0]['content'][0]['text'] %}
+        {%- set loop_messages = messages[1:] %}
+    {%- endif %}
+{%- else %}
+    {%- set system_message = default_system_message %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+{#- A missing `tools` variable is treated as `none`; when tools are supplied, the parallel-tool-call instructions are prepended to the system prompt. #}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- elif tools is not none %}
+    {%- set parallel_tool_prompt = "You are a helpful assistant that can call tools. If you call one or more tools, format them in a single JSON array or objects, where each object is a tool call, not as separate objects outside of an array or multiple arrays. Use the format [{\"name\": tool call name, \"arguments\": tool call arguments}, additional tool calls] if you call more than one tool. If you call tools, do not attempt to interpret them or otherwise provide a response until you receive a tool call result that you can interpret for the user." %}
+    {#- `system_message` is assigned on every path above, so the `else` branch here acts only as a defensive fallback. #}
+    {%- if system_message is defined %}
+        {%- set system_message = parallel_tool_prompt + "\n\n" + system_message %}
+    {%- else %}
+        {%- set system_message = parallel_tool_prompt %}
+    {%- endif %}
+{%- endif %}
+{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}
+{#- All user-role messages, in order; the rendering loop further down compares against the last entry. #}
+{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}
+{#- Role-alternation check: ignoring tool results and messages that carry tool calls, the remaining messages are expected to go user/assistant/user/... #}
+{#- NOTE(review): under Jinja2 scoping rules a `set` inside a `for` body does not persist after the loop, so `filtered_messages` presumably is still [] when the second loop runs and the check never fires. Accumulating across iterations would need a `namespace()` object. Confirm before relying on this validation. #}
+{%- set filtered_messages = [] %}
+{%- for message in loop_messages %}
+    {%- if message["role"] not in ["tool", "tool_results"] and not message.get("tool_calls") %}
+        {%- set filtered_messages = filtered_messages + [message] %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in filtered_messages %}
+    {#- Even positions must be user turns, odd positions must be anything else. #}
+    {%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %}
+        {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif %}
+{%- endfor %}
+{#- Main rendering loop: emit every message of `loop_messages` in the bracketed control-token format ([INST], [TOOL_CALLS], [TOOL_RESULTS], ...). #}
+{%- for message in loop_messages %}
+    {%- if message["role"] == "user" %}
+        {#- The tool catalogue is emitted in front of the user message that equals `user_messages[-1]` (normally the last user turn). #}
+        {%- if tools is not none and (message == user_messages[-1]) %}
+            {{- "[AVAILABLE_TOOLS] [" }}
+            {%- for tool in tools %}
+                {%- set tool = tool.function %}
+                {{- '{"type": "function", "function": {' }}
+                {#- Serialise every field of the function spec except "return": strings are wrapped in quotes as-is, everything else goes through `tojson`. #}
+                {%- for key, val in tool.items() if key != "return" %}
+                    {%- if val is string %}
+                        {{- '"' + key + '": "' + val + '"' }}
+                    {%- else %}
+                        {{- '"' + key + '": ' + val|tojson }}
+                    {%- endif %}
+                    {%- if not loop.last %}
+                        {{- ", " }}
+                    {%- endif %}
+                {%- endfor %}
+                {{- "}}" }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- else %}
+                    {{- "]" }}
+                {%- endif %}
+            {%- endfor %}
+            {{- "[/AVAILABLE_TOOLS]" }}
+        {%- endif %}
+        {#- User content is either a plain string or a list of typed blocks; image blocks are rendered as the [IMG] placeholder. #}
+        {%- if message['content'] is string %}
+            {{- '[INST]' + message['content'] + '[/INST]' }}
+        {%- else %}
+            {{- '[INST]' }}
+            {%- for block in message['content'] %}
+                {%- if block['type'] == 'text' %}
+                    {{- block['text'] }}
+                {%- elif block['type'] == 'image' or block['type'] == 'image_url' %}
+                    {{- '[IMG]' }}
+                {%- else %}
+                    {{- raise_exception('Only text and image blocks are supported in message content!') }}
+                {%- endif %}
+            {%- endfor %}
+            {{- '[/INST]' }}
+        {%- endif %}
+    {%- elif message["role"] == "tool_calls" or message.tool_calls is defined %}
+        {#- Tool calls are read from `message.tool_calls` when present, otherwise from `message.content`. #}
+        {%- if message.tool_calls is defined %}
+            {%- set tool_calls = message.tool_calls %}
+        {%- else %}
+            {%- set tool_calls = message.content %}
+        {%- endif %}
+        {{- "[TOOL_CALLS] [" }}
+        {%- for tool_call in tool_calls %}
+            {#- Emit the serialised function object without its last character (the closing brace) so the "id" field can be appended. #}
+            {%- set out = tool_call.function|tojson %}
+            {{- out[:-1] }}
+            {%- if not tool_call.id is defined or tool_call.id|length < 9 %}
+                {#- `default` keeps a missing id from triggering an undefined-variable error while the message is built, so the intended error text is what gets raised. #}
+                {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + (tool_call.id | default("") | string)) }}
+            {%- endif %}
+            {#- Only the last nine characters of the id are emitted. #}
+            {{- ', "id": "' + tool_call.id[-9:] + '"}' }}
+            {%- if not loop.last %}
+                {{- ", " }}
+            {%- else %}
+                {{- "]" + eos_token }}
+            {%- endif %}
+        {%- endfor %}
+    {%- elif message['role'] == 'assistant' %}
+        {#- Plain assistant turn: string content, or the `text` of the first content block, followed by the end-of-sequence token. #}
+        {%- if message['content'] is string %}
+            {{- message['content'] + eos_token }}
+        {%- else %}
+            {{- message['content'][0]['text'] + eos_token }}
+        {%- endif %}
+    {%- elif message["role"] == "tool_results" or message["role"] == "tool" %}
+        {#- Tool output may be nested one level deep (`message.content.content`); unwrap it when present. #}
+        {%- if message.content is defined and message.content.content is defined %}
+            {%- set content = message.content.content %}
+        {%- else %}
+            {%- set content = message.content %}
+        {%- endif %}
+        {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }}
+        {%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %}
+            {#- Same guard as for tool calls: a missing id must not break construction of the error message. #}
+            {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (2)" + (message.tool_call_id | default("") | string)) }}
+        {%- endif %}
+        {{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }}
+    {%- else %}
+        {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
+    {%- endif %}
+{%- endfor %}
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,10 +3,10 @@
 requires = [
    "cmake>=3.26",
    "ninja",
-    "packaging",
+    "packaging>=24.2",
-    "setuptools>=61",
+    "setuptools>=77.0.3,<80.0.0",
    "setuptools-scm>=8.0",
-    "torch == 2.6.0",
+    "torch == 2.7.0",
    "wheel",
    "jinja2",
 ]
@@ -41,6 +41,9 @@ Slack="http://slack.vllm.ai/"
 [project.scripts]
 vllm = "vllm.entrypoints.cli.main:main"
+[project.entry-points."vllm.general_plugins"]
+lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
 [tool.setuptools_scm]
 # no extra settings needed, presence enables setuptools-scm
@@ -50,6 +53,8 @@ include = ["vllm*"]
 [tool.yapfignore]
 ignore_patterns = [
+    ".buildkite/**",
+    "benchmarks/**",
    "build/**",
 ]
@@ -66,26 +71,15 @@ exclude = [
 "vllm/third_party/**" = ["ALL"]
 "vllm/version.py" = ["F401"]
 "vllm/_version.py" = ["ALL"]
-# Python 3.8 typing. TODO: Remove these excludes after v1.0.0
+# Python 3.8 typing - skip V0 code
-"vllm/adapter_commons/**/*.py" = ["UP006", "UP035"]
 "vllm/attention/**/*.py" = ["UP006", "UP035"]
-"vllm/compilation/**/*.py" = ["UP006", "UP035"]
 "vllm/core/**/*.py" = ["UP006", "UP035"]
-"vllm/device_allocator/**/*.py" = ["UP006", "UP035"]
-"vllm/distributed/**/*.py" = ["UP006", "UP035"]
 "vllm/engine/**/*.py" = ["UP006", "UP035"]
 "vllm/executor/**/*.py" = ["UP006", "UP035"]
-"vllm/lora/**/*.py" = ["UP006", "UP035"]
-"vllm/model_executor/**/*.py" = ["UP006", "UP035"]
-"vllm/platforms/**/*.py" = ["UP006", "UP035"]
-"vllm/plugins/**/*.py" = ["UP006", "UP035"]
-"vllm/profiler/**/*.py" = ["UP006", "UP035"]
 "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
 "vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
-"vllm/transformers_utils/**/*.py" = ["UP006", "UP035"]
-"vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
-"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
 "vllm/worker/**/*.py" = ["UP006", "UP035"]
+# Python 3.8 typing - skip utils for ROCm
 "vllm/utils.py" = ["UP006", "UP035"]
 [tool.ruff.lint]
@@ -102,6 +96,7 @@ select = [
    "SIM",
    # isort
    # "I",
+    # flake8-logging-format
    "G",
 ]
 ignore = [
@@ -150,6 +145,10 @@ ignore-words-list = "dout, te, indicies, subtile, ElementE"
 skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
 [tool.isort]
+skip_glob = [
+    ".buildkite/*",
+    "benchmarks/*",
+]
 use_parentheses = true
 skip_gitignore = true
@@ -158,7 +157,6 @@ markers = [
    "skip_global_cleanup",
    "core_model: enable this model test in each PR instead of only nightly",
    "cpu_model: enable this model test in CPU tests",
-    "quant_model: run this model test under Quantized category",
    "split: run this test as part of a split",
    "distributed: run this test only in distributed GPU tests",
    "skip_v1: do not run this test with v1",
@@ -171,3 +169,9 @@ plugins.md013.enabled = false # line-length
 plugins.md041.enabled = false # first-line-h1
 plugins.md033.enabled = false # inline-html
 plugins.md024.allow_different_nesting = true # no-duplicate-headers
+[tool.ty]
+respect-ignore-files = true
+[tool.ty.environment]
+python = "./.venv"
--- a/requirements/build.txt
+++ b/requirements/build.txt
 # Should be mirrored in pyproject.toml
 cmake>=3.26
 ninja
-packaging
+packaging>=24.2
-setuptools>=61
+setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-torch==2.6.0
+torch==2.7.0
 wheel
 jinja2>=3.1.6
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -19,31 +19,31 @@ pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.11, < 0.11
-llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
+llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
 outlines == 0.1.11
 lark == 1.2.2
-xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
+xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq >= 25.0.0
 msgspec
 gguf >= 0.13.0
-importlib_metadata
+importlib_metadata; python_version < '3.10'
 mistral_common[opencv] >= 1.5.4
 opencv-python-headless >= 4.11.0    # required for video IO
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
-setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
+setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.3 # required for compressed-tensors
+compressed-tensors == 0.9.4 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/other/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
-opentelemetry-sdk>=1.26.0,<1.27.0  # vllm.tracing
+opentelemetry-sdk>=1.26.0  # vllm.tracing
-opentelemetry-api>=1.26.0,<1.27.0  # vllm.tracing
+opentelemetry-api>=1.26.0  # vllm.tracing
-opentelemetry-exporter-otlp>=1.26.0,<1.27.0  # vllm.tracing
+opentelemetry-exporter-otlp>=1.26.0  # vllm.tracing
-opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0  # vllm.tracing
+opentelemetry-semantic-conventions-ai>=0.4.1  # vllm.tracing
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -2,18 +2,19 @@
 -r common.txt
 # Dependencies for CPUs
-torch==2.6.0+cpu; platform_machine == "x86_64"
+--extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.6.0; platform_system == "Darwin"
+torch==2.7.0+cpu; platform_machine == "x86_64"
-torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
+torch==2.7.0; platform_system == "Darwin"
+torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
 torch==2.7.0.dev20250304; platform_machine == "s390x"
 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchaudio==2.6.0; platform_machine == "ppc64le"
+torchaudio==2.7.0; platform_machine == "ppc64le"
 # required for the image processor of phi3v, this must be updated alongside torch
 torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchvision==0.21.0; platform_machine == "ppc64le"
+torchvision==0.22.0; platform_machine == "ppc64le"
 datasets # for benchmark scripts
 # cpu cannot use triton 3.3.0

--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -6,8 +6,9 @@ numba == 0.61.2; python_version > '3.9'
 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.6.0
+torch==2.7.0
-torchaudio==2.6.0
+torchaudio==2.7.0
 # These must be updated alongside torch
-torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.6.0
+# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30
+xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
-sphinx==6.2.1
+sphinx==7.4.7
-sphinx-argparse==0.4.0
+sphinx-argparse==0.5.2
-sphinx-book-theme==1.0.1
+sphinx-book-theme==1.1.4
 sphinx-copybutton==0.5.2
 sphinx-design==0.6.1
 sphinx-togglebutton==0.3.2
-myst-parser==3.0.1
+myst-parser==3.0.1  # `myst-parser==4.0.1` breaks inline code in titles
 msgspec
-cloudpickle
+snowballstemmer<3  # https://github.com/snowballstem/snowball/issues/229
 commonmark # Required by sphinx-argparse when using :markdownhelp:
+# Custom autodoc2 is necessary for faster docstring processing
+# see: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33#issuecomment-2856386035
+git+https://github.com/hmellor/sphinx-autodoc2.git # sphinx-autodoc2==0.5.0
 # packages to install to build the documentation
 cachetools
-pydantic >= 2.8
 -f https://download.pytorch.org/whl/cpu
 torch
-py-cpuinfo
\ No newline at end of file
-transformers
-mistral_common >= 1.5.4
-aiohttp
-starlette
-scipy
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
-fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
-partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
-requests
-zmq