Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
...@@ -8,7 +8,7 @@ with the reasoning parser: ...@@ -8,7 +8,7 @@ with the reasoning parser:
```bash ```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1 --reasoning-parser deepseek_r1
``` ```
This example demonstrates how to generate chat completions from reasoning models This example demonstrates how to generate chat completions from reasoning models
......
...@@ -8,7 +8,7 @@ parser: ...@@ -8,7 +8,7 @@ parser:
```bash ```bash
vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
--enable-reasoning --reasoning-parser deepseek_r1 --reasoning-parser deepseek_r1
``` ```
Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the
......
# SPDX-License-Identifier: Apache-2.0
import argparse
import pprint
import requests
def post_http_request(payload: dict, api_url: str) -> requests.Response:
    """POST ``payload`` as a JSON body to ``api_url``.

    Args:
        payload: JSON-serialisable request body.
        api_url: Full URL of the endpoint to call.

    Returns:
        The ``requests.Response`` as returned by ``requests.post``; the
        status code is not inspected here.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=payload,
    )
def parse_args():
    """Parse the classification client's command-line options.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``model``.
    """
    parser = argparse.ArgumentParser()
    # (flag, value type, default) for every supported option.
    option_table = (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "jason9693/Qwen2.5-1.5B-apeach"),
    )
    for flag, value_type, fallback in option_table:
        parser.add_argument(flag, type=value_type, default=fallback)
    return parser.parse_args()
def main(args):
    """Send a fixed batch of prompts to the server's ``/classify`` route.

    The decoded JSON reply is pretty-printed to stdout.

    Args:
        args: Parsed command-line namespace providing ``host``, ``port``
            and ``model``.
    """
    host, port, model_name = args.host, args.port, args.model
    api_url = f"http://{host}:{port}/classify"

    payload = {
        "model": model_name,
        "input": [
            "Hello, my name is",
            "The president of the United States is",
            "The capital of France is",
            "The future of AI is",
        ],
    }

    classify_response = post_http_request(payload=payload, api_url=api_url)
    pprint.pprint(classify_response.json())


if __name__ == "__main__":
    args = parse_args()
    main(args)
...@@ -46,11 +46,15 @@ async def stream_openai_response(): ...@@ -46,11 +46,15 @@ async def stream_openai_response():
"model": "openai/whisper-large-v3", "model": "openai/whisper-large-v3",
} }
url = openai_api_base + "/audio/transcriptions" url = openai_api_base + "/audio/transcriptions"
headers = {"Authorization": f"Bearer {openai_api_key}"}
print("transcription result:", end=' ') print("transcription result:", end=' ')
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
with open(str(winning_call), "rb") as f: with open(str(winning_call), "rb") as f:
async with client.stream('POST', url, files={'file': f}, async with client.stream('POST',
data=data) as response: url,
files={'file': f},
data=data,
headers=headers) as response:
async for line in response.aiter_lines(): async for line in response.aiter_lines():
# Each line is a JSON object prefixed with 'data: ' # Each line is a JSON object prefixed with 'data: '
if line: if line:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
Example to deploy DeepSeek R1 or V3 with Ray Serve LLM. Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
See Ray Serve LLM documentation at: See more details at:
https://docs.ray.io/en/latest/serve/tutorials/serve-deepseek.html
And see Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html https://docs.ray.io/en/latest/serve/llm/serving-llms.html
Run `python3 ray_serve_deepseek.py` to deploy the model. Run `python3 ray_serve_deepseek.py` to deploy the model.
......
# SPDX-License-Identifier: Apache-2.0
"""
Retrieval Augmented Generation (RAG) Implementation with Langchain
==================================================================
This script demonstrates a RAG implementation using LangChain, Milvus
and vLLM. RAG enhances LLM responses by retrieving relevant context
from a document collection.
Features:
- Web content loading and chunking
- Vector storage with Milvus
- Embedding generation with vLLM
- Question answering with context
Prerequisites:
1. Install dependencies:
pip install -U vllm \
langchain_milvus langchain_openai \
langchain_community beautifulsoup4 \
langchain-text-splitters
2. Start services:
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
Usage:
python retrieval_augmented_generation_with_langchain.py
Notes:
- Ensure both vLLM services are running before executing
- Default ports: 8000 (embedding), 8001 (chat)
- First run may take time to download models
"""
import argparse
from argparse import Namespace
from typing import Any
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_milvus import Milvus
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
def load_and_split_documents(config: dict[str, Any]):
    """Fetch the page at ``config["url"]`` and split it into chunks.

    Args:
        config: Settings dict; reads ``url``, ``chunk_size`` and
            ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for the loaded documents.

    Raises:
        Exception: Any failure while loading or splitting is reported on
            stdout and then re-raised unchanged.
    """
    try:
        pages = WebBaseLoader(web_paths=(config["url"], )).load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=config["chunk_size"],
            chunk_overlap=config["chunk_overlap"],
        )
        chunks = splitter.split_documents(pages)
    except Exception as e:
        print(f"Error loading document from {config['url']}: {str(e)}")
        raise
    return chunks
def init_vectorstore(config: dict[str, Any], documents: list[Document]):
    """Create a Milvus vector store populated with ``documents``.

    Args:
        config: Settings dict; reads ``embedding_model``, ``vllm_api_key``,
            ``vllm_embedding_endpoint`` and ``uri``.
        documents: Document chunks to embed and store.

    Returns:
        The object returned by ``Milvus.from_documents``.
    """
    embedder = OpenAIEmbeddings(
        model=config["embedding_model"],
        openai_api_key=config["vllm_api_key"],
        openai_api_base=config["vllm_embedding_endpoint"],
    )
    # drop_old=True is passed through as-is; presumably it replaces an
    # existing collection -- confirm against the langchain_milvus docs.
    return Milvus.from_documents(
        documents=documents,
        embedding=embedder,
        connection_args={"uri": config["uri"]},
        drop_old=True,
    )
def init_llm(config: dict[str, Any]):
    """Create the chat model client.

    Args:
        config: Settings dict; reads ``chat_model``, ``vllm_api_key`` and
            ``vllm_chat_endpoint``.

    Returns:
        A ``ChatOpenAI`` instance configured with those values.
    """
    chat_settings = {
        "model": config["chat_model"],
        "openai_api_key": config["vllm_api_key"],
        "openai_api_base": config["vllm_chat_endpoint"],
    }
    return ChatOpenAI(**chat_settings)
def get_qa_prompt():
    """Build the prompt template used for question answering.

    Returns:
        A ``PromptTemplate`` created from text containing ``{question}``
        and ``{context}`` placeholders.
    """
    # The continuation lines start at column 0 on purpose: they are part of
    # the prompt text, not of the surrounding code.
    qa_template_text = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
    return PromptTemplate.from_template(qa_template_text)
def format_docs(docs: list[Document]):
    """Concatenate document texts for insertion into the prompt.

    Args:
        docs: Objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined by a blank line; an empty string
        when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
def create_qa_chain(retriever: Any, llm: ChatOpenAI, prompt: PromptTemplate):
    """Compose the retrieval -> prompt -> LLM -> string pipeline.

    Args:
        retriever: Runnable that yields documents for a question.
        llm: Chat model that answers the rendered prompt.
        prompt: Template expecting ``context`` and ``question`` inputs.

    Returns:
        The composed chain; its final stage is a ``StrOutputParser``.
    """
    # "context" is the retrieved documents flattened to text; "question" is
    # the chain input passed through untouched.
    chain_inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    return chain_inputs | prompt | llm | StrOutputParser()
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the LangChain RAG example.

    Returns:
        An ``argparse.ArgumentParser`` with all options registered; the
        caller is responsible for invoking ``parse_args``.
    """
    parser = argparse.ArgumentParser(description='RAG with vLLM and langchain')

    # (flags, add_argument keyword arguments), listed in --help order.
    option_specs = [
        (('--vllm-api-key', ), {
            "default": "EMPTY",
            "help": 'API key for vLLM compatible services',
        }),
        (('--vllm-embedding-endpoint', ), {
            "default": "http://localhost:8000/v1",
            "help": 'Base URL for embedding service',
        }),
        (('--vllm-chat-endpoint', ), {
            "default": "http://localhost:8001/v1",
            "help": 'Base URL for chat service',
        }),
        (('--uri', ), {
            "default": "./milvus.db",
            "help": 'URI for Milvus database',
        }),
        (('--url', ), {
            "default": ("https://docs.vllm.ai/en/latest/getting_started/"
                        "quickstart.html"),
            "help": 'URL of the document to process',
        }),
        (('--embedding-model', ), {
            "default": "ssmits/Qwen2-7B-Instruct-embed-base",
            "help": 'Model name for embeddings',
        }),
        (('--chat-model', ), {
            "default": "qwen/Qwen1.5-0.5B-Chat",
            "help": 'Model name for chat',
        }),
        (('-i', '--interactive'), {
            "action": 'store_true',
            "help": 'Enable interactive Q&A mode',
        }),
        (('-k', '--top-k'), {
            "type": int,
            "default": 3,
            "help": 'Number of top results to retrieve',
        }),
        (('-c', '--chunk-size'), {
            "type": int,
            "default": 1000,
            "help": 'Chunk size for document splitting',
        }),
        (('-o', '--chunk-overlap'), {
            "type": int,
            "default": 200,
            "help": 'Chunk overlap for document splitting',
        }),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser
def init_config(args: Namespace):
    """Copy the relevant command-line values into a plain settings dict.

    Args:
        args: Parsed namespace; every listed setting must be present as an
            attribute.

    Returns:
        A dict whose keys equal the attribute names read from ``args``.
    """
    setting_names = (
        "vllm_api_key",
        "vllm_embedding_endpoint",
        "vllm_chat_endpoint",
        "uri",
        "embedding_model",
        "chat_model",
        "url",
        "chunk_size",
        "chunk_overlap",
        "top_k",
    )
    return {name: getattr(args, name) for name in setting_names}
def main():
    """Run the LangChain RAG example end to end.

    Parses the command line, indexes the configured web page, builds the
    question-answering chain and then answers either one built-in question
    or, with ``--interactive``, questions typed by the user.
    """
    args = get_parser().parse_args()
    config = init_config(args)

    # Ingest the source page and expose it through a top-k retriever.
    documents = load_and_split_documents(config)
    vectorstore = init_vectorstore(config, documents)
    retriever = vectorstore.as_retriever(search_kwargs={"k": config["top_k"]})

    # Wire retriever, chat model and prompt into a single chain.
    qa_chain = create_qa_chain(retriever, init_llm(config), get_qa_prompt())

    if not args.interactive:
        # Single-question mode: answer the built-in question and stop.
        output = qa_chain.invoke("How to install vLLM?")
        print("-" * 50)
        print(output)
        print("-" * 50)
        return

    print("\nWelcome to Interactive Q&A System!")
    print("Enter 'q' or 'quit' to exit.")
    while True:
        question = input("\nPlease enter your question: ")
        if question.lower() in ('q', 'quit'):
            print("\nThank you for using! Goodbye!")
            break
        print(qa_chain.invoke(question))


if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
"""
RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
================================================================
This script demonstrates a RAG system using:
- LlamaIndex: For document indexing and retrieval
- Milvus: As vector store backend
- vLLM: For embedding and text generation
Features:
1. Document Loading & Processing
2. Embedding & Storage
3. Query Processing
Requirements:
1. Install dependencies:
pip install llama-index llama-index-readers-web \
llama-index-llms-openai-like \
llama-index-embeddings-openai-like \
llama-index-vector-stores-milvus
2. Start services:
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
Usage:
python retrieval_augmented_generation_with_llamaindex.py
Notes:
- Ensure both vLLM services are running before executing
- Default ports: 8000 (embedding), 8001 (chat)
- First run may take time to download models
"""
import argparse
from argparse import Namespace
from typing import Any
from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
from llama_index.llms.openai_like import OpenAILike
from llama_index.readers.web import SimpleWebPageReader
from llama_index.vector_stores.milvus import MilvusVectorStore
def init_config(args: Namespace):
    """Collect the command-line values used by this example into a dict.

    Args:
        args: Parsed namespace; each listed setting is read as an attribute.

    Returns:
        A dict keyed by setting name.
    """
    config = {}
    for setting in (
            "url",
            "embedding_model",
            "chat_model",
            "vllm_api_key",
            "embedding_endpoint",
            "chat_endpoint",
            "db_path",
            "chunk_size",
            "chunk_overlap",
            "top_k",
    ):
        config[setting] = getattr(args, setting)
    return config
def load_documents(url: str) -> list:
    """Load the web page at ``url`` through ``SimpleWebPageReader``.

    Args:
        url: Address of the page to load.

    Returns:
        The result of ``load_data`` for a one-element URL list; the reader
        is created with ``html_to_text=True``.
    """
    reader = SimpleWebPageReader(html_to_text=True)
    return reader.load_data([url])
def setup_models(config: dict[str, Any]):
    """Install the embedding model, chat model and splitter on ``Settings``.

    Args:
        config: Settings dict; reads ``embedding_endpoint``,
            ``vllm_api_key``, ``embedding_model``, ``chat_model``,
            ``chat_endpoint``, ``chunk_size`` and ``chunk_overlap``.

    Returns:
        None. The function only assigns ``Settings.embed_model``,
        ``Settings.llm`` and ``Settings.transformations``.
    """
    api_key = config["vllm_api_key"]

    Settings.embed_model = OpenAILikeEmbedding(
        api_base=config["embedding_endpoint"],
        api_key=api_key,
        model_name=config["embedding_model"],
    )

    Settings.llm = OpenAILike(
        model=config["chat_model"],
        api_key=api_key,
        api_base=config["chat_endpoint"],
        context_window=128000,
        is_chat_model=True,
        is_function_calling_model=False,
    )

    splitter = SentenceSplitter(
        chunk_size=config["chunk_size"],
        chunk_overlap=config["chunk_overlap"],
    )
    Settings.transformations = [splitter]
def setup_vector_store(db_path: str) -> MilvusVectorStore:
    """Create the Milvus store, sized to the current embedding model.

    A throwaway string is embedded first so that the vector dimension can
    be measured and passed to the store.

    Args:
        db_path: Value forwarded as the store's ``uri``.

    Returns:
        A ``MilvusVectorStore`` created with ``overwrite=True``.
    """
    sample_emb = Settings.embed_model.get_text_embedding("test")
    print(f"Embedding dimension: {len(sample_emb)}")
    store_options = {
        "uri": db_path,
        "dim": len(sample_emb),
        "overwrite": True,
    }
    return MilvusVectorStore(**store_options)
def create_index(documents: list, vector_store: MilvusVectorStore):
    """Index ``documents`` into ``vector_store``.

    Args:
        documents: Documents to index.
        vector_store: Store wrapped in the storage context used for
            indexing.

    Returns:
        The result of ``VectorStoreIndex.from_documents``.
    """
    context = StorageContext.from_defaults(vector_store=vector_store)
    return VectorStoreIndex.from_documents(documents, storage_context=context)
def query_document(index: VectorStoreIndex, question: str, top_k: int):
    """Answer ``question`` using a query engine built from ``index``.

    Args:
        index: Index to query.
        question: Natural-language question.
        top_k: Passed to the query engine as ``similarity_top_k``.

    Returns:
        The query engine's response object.
    """
    return index.as_query_engine(similarity_top_k=top_k).query(question)
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the LlamaIndex RAG example.

    Returns:
        An ``argparse.ArgumentParser`` with all options registered; the
        caller is responsible for invoking ``parse_args``.
    """
    parser = argparse.ArgumentParser(
        description='RAG with vLLM and LlamaIndex')
    add = parser.add_argument

    # Document source and model names.
    add('--url',
        default=("https://docs.vllm.ai/en/latest/getting_started/"
                 "quickstart.html"),
        help='URL of the document to process')
    add('--embedding-model',
        default="ssmits/Qwen2-7B-Instruct-embed-base",
        help='Model name for embeddings')
    add('--chat-model',
        default="qwen/Qwen1.5-0.5B-Chat",
        help='Model name for chat')

    # Service endpoints and storage.
    add('--vllm-api-key',
        default="EMPTY",
        help='API key for vLLM compatible services')
    add('--embedding-endpoint',
        default="http://localhost:8000/v1",
        help='Base URL for embedding service')
    add('--chat-endpoint',
        default="http://localhost:8001/v1",
        help='Base URL for chat service')
    add('--db-path',
        default="./milvus_demo.db",
        help='Path to Milvus database')

    # Run mode and retrieval tuning.
    add('-i',
        '--interactive',
        action='store_true',
        help='Enable interactive Q&A mode')
    add('-c',
        '--chunk-size',
        type=int,
        default=1000,
        help='Chunk size for document splitting')
    add('-o',
        '--chunk-overlap',
        type=int,
        default=200,
        help='Chunk overlap for document splitting')
    add('-k',
        '--top-k',
        type=int,
        default=3,
        help='Number of top results to retrieve')
    return parser
def main():
    """Run the LlamaIndex RAG example end to end.

    Parses the command line, loads the configured page, configures the
    models, builds the vector index and then answers either one built-in
    question or, with ``--interactive``, questions typed by the user.
    """
    args = get_parser().parse_args()
    config = init_config(args)

    # Build the index: load the page, configure models, create the store.
    documents = load_documents(config["url"])
    setup_models(config)
    vector_store = setup_vector_store(config["db_path"])
    index = create_index(documents, vector_store)

    divider = "-" * 50
    top_k = config["top_k"]

    if not args.interactive:
        # Single-query mode.
        response = query_document(index, "How to install vLLM?", top_k)
        print(divider)
        print("Response:\n")
        print(response)
        print(divider)
        return

    print("\nEntering interactive mode. Type 'quit' to exit.")
    # Keep answering until the user types one of the exit commands.
    while (question := input("\nEnter your question: ")).lower() not in (
            'quit', 'exit', 'q'):
        print("\n" + divider)
        print("Response:\n")
        print(query_document(index, question, top_k))
        print(divider)
    print("Exiting interactive mode...")


if __name__ == "__main__":
    main()
# SPDX-License-Identifier: Apache-2.0
"""
vLLM Chat Assistant - A Streamlit Web Interface
A streamlined chat interface that quickly integrates
with vLLM API server.
Features:
- Multiple chat sessions management
- Streaming response display
- Configurable API endpoint
- Real-time chat history
Requirements:
pip install streamlit openai
Usage:
# Start the app with default settings
streamlit run streamlit_openai_chatbot_webserver.py
# Start with custom vLLM API endpoint
VLLM_API_BASE="http://your-server:8000/v1" \
streamlit run streamlit_openai_chatbot_webserver.py
# Enable debug mode
streamlit run streamlit_openai_chatbot_webserver.py \
--logger.level=debug
"""
import os
from datetime import datetime
import streamlit as st
from openai import OpenAI
# Connection settings are read from environment variables, falling back to
# local defaults when they are unset.
openai_api_key = os.getenv('VLLM_API_KEY', "EMPTY")
openai_api_base = os.getenv('VLLM_API_BASE', "http://localhost:8000/v1")

# Seed every session-state entry the app relies on, leaving entries that
# already exist untouched.
for _state_key, _make_default in (
    ("sessions", dict),
    ("current_session", lambda: None),
    ("messages", list),
    ("active_session", lambda: None),
    ("api_base_url", lambda: openai_api_base),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
def create_new_chat_session():
    """Start an empty chat session and make it the selected one.

    The session ID is the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``. The new session is registered in
    ``sessions``, selected as both current and active, and the visible
    message list is reset.
    """
    state = st.session_state
    session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    state.sessions[session_id] = []
    state.messages = []
    state.current_session = session_id
    state.active_session = session_id
def switch_to_chat_session(session_id):
    """Select ``session_id`` and show its stored messages.

    Args:
        session_id: Key of an existing entry in ``sessions``.
    """
    state = st.session_state
    state.messages = state.sessions[session_id]
    state.current_session = session_id
    state.active_session = session_id
def get_llm_response(messages, model):
    """Request a streamed chat completion from the module-level client.

    Args:
        messages: List of message dictionaries
        model: Name of model

    Returns:
        The streaming response object on success. If the request raises,
        the error is shown with ``st.error`` and an ``"Error: ..."`` string
        is returned instead, so callers must check for ``str``.
    """
    request = {"model": model, "messages": messages, "stream": True}
    try:
        return client.chat.completions.create(**request)
    except Exception as e:
        st.error(f"Error details: {str(e)}")
        return f"Error: {str(e)}"
# Sidebar, part 1: editable API endpoint.
st.sidebar.title("API Settings")
new_api_base = st.sidebar.text_input("API Base URL:",
                                     value=st.session_state.api_base_url)
if new_api_base != st.session_state.api_base_url:
    # Persist the edited URL, then rerun the script so it takes effect.
    st.session_state.api_base_url = new_api_base
    st.rerun()

st.sidebar.divider()

# Sidebar, part 2: session management.
st.sidebar.title("Chat Sessions")
if st.sidebar.button("New Session"):
    create_new_chat_session()

# One button per session, newest first (IDs are timestamps, so reverse
# lexical order is reverse chronological order).
for session_id in sorted(st.session_state.sessions, reverse=True):
    is_active = session_id == st.session_state.active_session
    # The active session gets a pin label and the "primary" button type.
    if is_active:
        button_label = f"📍 {session_id}"
        highlight = {"type": "primary"}
    else:
        button_label = f"Session {session_id}"
        highlight = {}
    st.sidebar.button(button_label,
                      key=session_id,
                      on_click=switch_to_chat_session,
                      args=(session_id, ),
                      **highlight)
# Main pane.
st.title("vLLM Chat Assistant")

# Build the client against the endpoint currently stored in session state.
client = OpenAI(api_key=openai_api_key, base_url=st.session_state.api_base_url)

# Use the first model the server lists and show its id.
models = client.models.list()
model = models.data[0].id
st.markdown(f"**Model**: {model}")

# Make sure there is always a session to write into.
if st.session_state.current_session is None:
    create_new_chat_session()
    st.session_state.active_session = st.session_state.current_session

# Replay the stored conversation of the current session.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(message["content"])

# Handle a newly submitted user message, if any.
prompt = st.chat_input("Type your message here...")
if prompt:
    # Record the user turn and bind the message list to the current session.
    history = st.session_state.messages
    history.append({"role": "user", "content": prompt})
    st.session_state.sessions[st.session_state.current_session] = history

    with st.chat_message("user"):
        st.write(prompt)

    # Only role and content are sent to the model.
    messages_for_llm = [{
        "role": m["role"],
        "content": m["content"]
    } for m in st.session_state.messages]

    with st.chat_message("assistant"):
        message_placeholder = st.empty()
        response = get_llm_response(messages_for_llm, model)

        if isinstance(response, str):
            # A string result is the error text from get_llm_response.
            full_response = response
            message_placeholder.markdown(response)
        else:
            full_response = ""
            for chunk in response:
                content = getattr(chunk.choices[0].delta, "content", None)
                if content:
                    full_response += content
                    # Trailing block character marks streaming in progress.
                    message_placeholder.markdown(full_response + "▌")
            message_placeholder.markdown(full_response)

    # Record the assistant turn (or the error text) in the history.
    st.session_state.messages.append({
        "role": "assistant",
        "content": full_response
    })
# SPDX-License-Identifier: Apache-2.0
from openai import APIConnectionError, OpenAI
from openai.pagination import SyncPage
from openai.types.model import Model
def get_first_model(client: OpenAI) -> str:
    """Return the id of the first model listed by the server.

    Args:
        client: OpenAI client pointed at the vLLM server.

    Returns:
        The ``id`` of the first entry in the model list.

    Raises:
        RuntimeError: If the server cannot be reached (chained from the
            underlying ``APIConnectionError``) or if it lists no models.
            Note that the connection-failure message contains the client's
            API key in clear text.
    """
    try:
        models: SyncPage[Model] = client.models.list()
    except APIConnectionError as e:
        raise RuntimeError(
            "Failed to get the list of models from the vLLM server at "
            f"{client.base_url} with API key {client.api_key}. Check\n"
            "1. the server is running\n"
            "2. the server URL is correct\n"
            "3. the API key is correct") from e

    if len(models.data) > 0:
        return models.data[0].id

    raise RuntimeError(
        f"No models found on the vLLM server at {client.base_url}")
{#-
  Chat template: emits the content of every user and assistant message,
  in order, with nothing added between them. Messages with any other
  role are skipped. All tags use whitespace control, so the template
  contributes no whitespace of its own.
-#}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- message['content'] -}}
{%- elif message['role'] == 'assistant' -%}
{{- message['content'] -}}
{%- endif -%}
{%- endfor -%}
{#-
  Chat template with "USER: " / "ASSISTANT: " prefixes.
  - An optional leading system message is emitted right after bos_token;
    otherwise the system text is empty.
  - The remaining messages must alternate user/assistant, starting with
    user; raise_exception is called otherwise.
  - Assistant turns are terminated with eos_token.
  - "ASSISTANT:" is appended when add_generation_prompt is truthy.
-#}
{%- if messages[0]['role'] == 'system' -%}
{%- set system_message = messages[0]['content'] -%}
{%- set messages = messages[1:] -%}
{%- else -%}
{% set system_message = '' -%}
{%- endif -%}
{{ bos_token + system_message }}
{%- for message in messages -%}
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{%- endif -%}
{%- if message['role'] == 'user' -%}
{{ 'USER: ' + message['content'] + '\n' }}
{%- elif message['role'] == 'assistant' -%}
{{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{ 'ASSISTANT:' }}
{% endif %}
{#-
  Tool-calling chat template using the <|User|> / <|Assistant|> and
  <|tool...|> marker tokens.
  - add_generation_prompt defaults to false when it is not defined.
  - All system messages are merged (joined by a blank line) and emitted
    after bos_token.
  - When tools is truthy, a "# Tools" section is appended containing each
    tool serialised with tojson, followed by call-format instructions.
  - Every user turn already ends with <|Assistant|>, so the trailing
    generation prompt is only emitted when the last message was neither
    a user message nor a tool output.
  - Consecutive tool messages are wrapped in one tool-outputs begin/end
    pair; the end marker is emitted by the next assistant turn or at the
    end of the template.
  NOTE(review): the tools section emits a closing "</tools>" without a
  matching opening tag; confirm this is intended before changing it.
-#}
{% if not add_generation_prompt is defined %}
{% set add_generation_prompt = false %}
{% endif %}
{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %}
{%- for message in messages %}
{%- if message['role'] == 'system' %}
{%- if ns.is_first_sp %}
{% set ns.system_prompt = ns.system_prompt + message['content'] %}
{% set ns.is_first_sp = false %}
{%- else %}
{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}
{%- endif %}
{%- endif %}
{%- endfor %}
{{ bos_token }}
{{ ns.system_prompt }}
{%- if tools %}
{{"\n\n# Tools\n\nYou may call one or more functions to assist with the user query." }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{"\n</tools>\n\n"}}
{{"For function call returns, you should first print <|tool▁calls▁begin|>"}}
{{"For each function call, you should return object like:\n" }}
{{"<|tool▁call▁begin|>function<|tool▁sep|><function_name>\n```json\n<function_arguments_in_json_format>\n```<|tool▁call▁end|>"}}
{{"At the end of function call returns, you should print <|tool▁calls▁end|><|end▁of▁sentence|>"}}
{%- endif %}
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{%- set ns.is_tool = false -%}
{%- set ns.is_first = false -%}
{%- set ns.is_last_user = true -%}
{{'<|User|>' + message['content'] + '<|Assistant|>'}}
{%- endif %}
{%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
{%- set ns.is_last_user = false -%}
{%- if ns.is_tool %}
{{'<|tool▁outputs▁end|>'}}
{%- endif %}
{%- set ns.is_first = false %}
{%- set ns.is_tool = false -%}
{%- set ns.is_output_first = true %}
{%- for tool in message['tool_calls'] %}
{%- if not ns.is_first %}
{%- if message['content'] is none %}
{{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}}
{%- else %}
{{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}}
{%- endif %}
{%- set ns.is_first = true -%}
{%- else %}
{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}}
{%- endif %}
{%- endfor %}
{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}
{%- endif %}
{%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%}
{%- set ns.is_last_user = false -%}
{%- if ns.is_tool %}
{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}
{%- set ns.is_tool = false -%}
{%- else %}
{% set content = message['content'] %}
{{content + '<|end▁of▁sentence|>'}}
{%- endif %}
{%- endif %}
{%- if message['role'] == 'tool' %}
{%- set ns.is_last_user = false -%}
{%- set ns.is_tool = true -%}
{%- if ns.is_output_first %}
{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}
{%- set ns.is_output_first = false %}
{%- else %}
{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}
{%- endif %}
{%- endif %}
{%- endfor -%}
{% if ns.is_tool %}
{{'<|tool▁outputs▁end|>'}}
{% endif %}
{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %}
{{'<|Assistant|>'}}
{% endif %}
{#-
  Chat template using [SYSTEM_PROMPT], [INST], [AVAILABLE_TOOLS],
  [TOOL_CALLS] and [TOOL_RESULTS] markers.
  - Needs strftime_now and raise_exception to be available when rendered.
  - The system prompt is the first message when its role is "system"
    (string content, or the text of its first content block); otherwise
    a built-in default containing today's date is used.
  - When tools is defined and not none, a parallel tool-calling
    instruction is prepended to the system prompt, and the tool list is
    emitted immediately before the last user message.
  - Ignoring tool-call and tool-result messages, roles must alternate
    user/assistant starting with user.
  - Tool call IDs must be at least 9 characters long; only the last 9
    characters are written to the output.
-#}
{%- set today = strftime_now("%Y-%m-%d") %}
{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\")" %}
{{- bos_token }}
{%- if messages[0]['role'] == 'system' %}
{%- if messages[0]['content'] is string %}
{%- set system_message = messages[0]['content'] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set system_message = messages[0]['content'][0]['text'] %}
{%- set loop_messages = messages[1:] %}
{%- endif %}
{%- else %}
{%- set system_message = default_system_message %}
{%- set loop_messages = messages %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- elif tools is not none %}
{%- set parallel_tool_prompt = "You are a helpful assistant that can call tools. If you call one or more tools, format them in a single JSON array or objects, where each object is a tool call, not as separate objects outside of an array or multiple arrays. Use the format [{\"name\": tool call name, \"arguments\": tool call arguments}, additional tool calls] if you call more than one tool. If you call tools, do not attempt to interpret them or otherwise provide a response until you receive a tool call result that you can interpret for the user." %}
{%- if system_message is defined %}
{%- set system_message = parallel_tool_prompt + "\n\n" + system_message %}
{%- else %}
{%- set system_message = parallel_tool_prompt %}
{%- endif %}
{%- endif %}
{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}
{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}
{%- set filtered_messages = [] %}
{%- for message in loop_messages %}
{%- if message["role"] not in ["tool", "tool_results"] and not message.get("tool_calls") %}
{%- set filtered_messages = filtered_messages + [message] %}
{%- endif %}
{%- endfor %}
{%- for message in filtered_messages %}
{%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %}
{{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}
{%- endif %}
{%- endfor %}
{%- for message in loop_messages %}
{%- if message["role"] == "user" %}
{%- if tools is not none and (message == user_messages[-1]) %}
{{- "[AVAILABLE_TOOLS] [" }}
{%- for tool in tools %}
{%- set tool = tool.function %}
{{- '{"type": "function", "function": {' }}
{%- for key, val in tool.items() if key != "return" %}
{%- if val is string %}
{{- '"' + key + '": "' + val + '"' }}
{%- else %}
{{- '"' + key + '": ' + val|tojson }}
{%- endif %}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- "}}" }}
{%- if not loop.last %}
{{- ", " }}
{%- else %}
{{- "]" }}
{%- endif %}
{%- endfor %}
{{- "[/AVAILABLE_TOOLS]" }}
{%- endif %}
{%- if message['content'] is string %}
{{- '[INST]' + message['content'] + '[/INST]' }}
{%- else %}
{{- '[INST]' }}
{%- for block in message['content'] %}
{%- if block['type'] == 'text' %}
{{- block['text'] }}
{%- elif block['type'] == 'image' or block['type'] == 'image_url' %}
{{- '[IMG]' }}
{%- else %}
{{- raise_exception('Only text and image blocks are supported in message content!') }}
{%- endif %}
{%- endfor %}
{{- '[/INST]' }}
{%- endif %}
{%- elif message["role"] == "tool_calls" or message.tool_calls is defined %}
{%- if message.tool_calls is defined %}
{%- set tool_calls = message.tool_calls %}
{%- else %}
{%- set tool_calls = message.content %}
{%- endif %}
{{- "[TOOL_CALLS] [" }}
{%- for tool_call in tool_calls %}
{%- set out = tool_call.function|tojson %}
{{- out[:-1] }}
{%- if not tool_call.id is defined or tool_call.id|length < 9 %}
{{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + tool_call.id) }}
{%- endif %}
{{- ', "id": "' + tool_call.id[-9:] + '"}' }}
{%- if not loop.last %}
{{- ", " }}
{%- else %}
{{- "]" + eos_token }}
{%- endif %}
{%- endfor %}
{%- elif message['role'] == 'assistant' %}
{%- if message['content'] is string %}
{{- message['content'] + eos_token }}
{%- else %}
{{- message['content'][0]['text'] + eos_token }}
{%- endif %}
{%- elif message["role"] == "tool_results" or message["role"] == "tool" %}
{%- if message.content is defined and message.content.content is defined %}
{%- set content = message.content.content %}
{%- else %}
{%- set content = message.content %}
{%- endif %}
{{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }}
{%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %}
{{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (2)" + message.tool_call_id) }}
{%- endif %}
{{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }}
{%- else %}
{{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
{%- endif %}
{%- endfor %}
...@@ -3,10 +3,10 @@ ...@@ -3,10 +3,10 @@
requires = [ requires = [
"cmake>=3.26", "cmake>=3.26",
"ninja", "ninja",
"packaging", "packaging>=24.2",
"setuptools>=61", "setuptools>=77.0.3,<80.0.0",
"setuptools-scm>=8.0", "setuptools-scm>=8.0",
"torch == 2.6.0", "torch == 2.7.0",
"wheel", "wheel",
"jinja2", "jinja2",
] ]
...@@ -41,6 +41,9 @@ Slack="http://slack.vllm.ai/" ...@@ -41,6 +41,9 @@ Slack="http://slack.vllm.ai/"
[project.scripts] [project.scripts]
vllm = "vllm.entrypoints.cli.main:main" vllm = "vllm.entrypoints.cli.main:main"
[project.entry-points."vllm.general_plugins"]
lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
[tool.setuptools_scm] [tool.setuptools_scm]
# no extra settings needed, presence enables setuptools-scm # no extra settings needed, presence enables setuptools-scm
...@@ -50,6 +53,8 @@ include = ["vllm*"] ...@@ -50,6 +53,8 @@ include = ["vllm*"]
[tool.yapfignore] [tool.yapfignore]
ignore_patterns = [ ignore_patterns = [
".buildkite/**",
"benchmarks/**",
"build/**", "build/**",
] ]
...@@ -66,26 +71,15 @@ exclude = [ ...@@ -66,26 +71,15 @@ exclude = [
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"] "vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"] "vllm/_version.py" = ["ALL"]
# Python 3.8 typing. TODO: Remove these excludes after v1.0.0 # Python 3.8 typing - skip V0 code
"vllm/adapter_commons/**/*.py" = ["UP006", "UP035"]
"vllm/attention/**/*.py" = ["UP006", "UP035"] "vllm/attention/**/*.py" = ["UP006", "UP035"]
"vllm/compilation/**/*.py" = ["UP006", "UP035"]
"vllm/core/**/*.py" = ["UP006", "UP035"] "vllm/core/**/*.py" = ["UP006", "UP035"]
"vllm/device_allocator/**/*.py" = ["UP006", "UP035"]
"vllm/distributed/**/*.py" = ["UP006", "UP035"]
"vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/engine/**/*.py" = ["UP006", "UP035"]
"vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"]
"vllm/lora/**/*.py" = ["UP006", "UP035"]
"vllm/model_executor/**/*.py" = ["UP006", "UP035"]
"vllm/platforms/**/*.py" = ["UP006", "UP035"]
"vllm/plugins/**/*.py" = ["UP006", "UP035"]
"vllm/profiler/**/*.py" = ["UP006", "UP035"]
"vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
"vllm/spec_decode/**/*.py" = ["UP006", "UP035"] "vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
"vllm/transformers_utils/**/*.py" = ["UP006", "UP035"]
"vllm/triton_utils/**/*.py" = ["UP006", "UP035"]
"vllm/vllm_flash_attn/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"]
# Python 3.8 typing - skip utils for ROCm
"vllm/utils.py" = ["UP006", "UP035"] "vllm/utils.py" = ["UP006", "UP035"]
[tool.ruff.lint] [tool.ruff.lint]
...@@ -102,6 +96,7 @@ select = [ ...@@ -102,6 +96,7 @@ select = [
"SIM", "SIM",
# isort # isort
# "I", # "I",
# flake8-logging-format
"G", "G",
] ]
ignore = [ ignore = [
...@@ -150,6 +145,10 @@ ignore-words-list = "dout, te, indicies, subtile, ElementE" ...@@ -150,6 +145,10 @@ ignore-words-list = "dout, te, indicies, subtile, ElementE"
skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*" skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
[tool.isort] [tool.isort]
skip_glob = [
".buildkite/*",
"benchmarks/*",
]
use_parentheses = true use_parentheses = true
skip_gitignore = true skip_gitignore = true
...@@ -158,7 +157,6 @@ markers = [ ...@@ -158,7 +157,6 @@ markers = [
"skip_global_cleanup", "skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly", "core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests", "cpu_model: enable this model test in CPU tests",
"quant_model: run this model test under Quantized category",
"split: run this test as part of a split", "split: run this test as part of a split",
"distributed: run this test only in distributed GPU tests", "distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1", "skip_v1: do not run this test with v1",
...@@ -171,3 +169,9 @@ plugins.md013.enabled = false # line-length ...@@ -171,3 +169,9 @@ plugins.md013.enabled = false # line-length
plugins.md041.enabled = false # first-line-h1 plugins.md041.enabled = false # first-line-h1
plugins.md033.enabled = false # inline-html plugins.md033.enabled = false # inline-html
plugins.md024.allow_different_nesting = true # no-duplicate-headers plugins.md024.allow_different_nesting = true # no-duplicate-headers
[tool.ty]
respect-ignore-files = true
[tool.ty.environment]
python = "./.venv"
# Should be mirrored in pyproject.toml # Should be mirrored in pyproject.toml
cmake>=3.26 cmake>=3.26
ninja ninja
packaging packaging>=24.2
setuptools>=61 setuptools>=77.0.3,<80.0.0
setuptools-scm>=8 setuptools-scm>=8
torch==2.6.0 torch==2.7.0
wheel wheel
jinja2>=3.1.6 jinja2>=3.1.6
...@@ -19,31 +19,31 @@ pillow # Required for image processing ...@@ -19,31 +19,31 @@ pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11 lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11 outlines == 0.1.11
lark == 1.2.2 lark == 1.2.2
xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64" xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0 pyzmq >= 25.0.0
msgspec msgspec
gguf >= 0.13.0 gguf >= 0.13.0
importlib_metadata importlib_metadata; python_version < '3.10'
mistral_common[opencv] >= 1.5.4 mistral_common[opencv] >= 1.5.4
opencv-python-headless >= 4.11.0 # required for video IO opencv-python-headless >= 4.11.0 # required for video IO
pyyaml pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.9.3 # required for compressed-tensors compressed-tensors == 0.9.4 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md python-json-logger # Used by logging as per examples/other/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing opentelemetry-sdk>=1.26.0 # vllm.tracing
opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing opentelemetry-api>=1.26.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing
...@@ -2,18 +2,19 @@ ...@@ -2,18 +2,19 @@
-r common.txt -r common.txt
# Dependencies for CPUs # Dependencies for CPUs
torch==2.6.0+cpu; platform_machine == "x86_64" --extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0; platform_system == "Darwin" torch==2.7.0+cpu; platform_machine == "x86_64"
torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0.dev20250304; platform_machine == "s390x" torch==2.7.0.dev20250304; platform_machine == "s390x"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.6.0; platform_machine == "ppc64le" torchaudio==2.7.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch # required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.21.0; platform_machine == "ppc64le" torchvision==0.22.0; platform_machine == "ppc64le"
datasets # for benchmark scripts datasets # for benchmark scripts
# cpu cannot use triton 3.3.0 # cpu cannot use triton 3.3.0
......
...@@ -6,8 +6,9 @@ numba == 0.61.2; python_version > '3.9' ...@@ -6,8 +6,9 @@ numba == 0.61.2; python_version > '3.9'
# Dependencies for NVIDIA GPUs # Dependencies for NVIDIA GPUs
ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.6.0 torch==2.7.0
torchaudio==2.6.0 torchaudio==2.7.0
# These must be updated alongside torch # These must be updated alongside torch
torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0 # https://github.com/facebookresearch/xformers/releases/tag/v0.0.30
xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
sphinx==6.2.1 sphinx==7.4.7
sphinx-argparse==0.4.0 sphinx-argparse==0.5.2
sphinx-book-theme==1.0.1 sphinx-book-theme==1.1.4
sphinx-copybutton==0.5.2 sphinx-copybutton==0.5.2
sphinx-design==0.6.1 sphinx-design==0.6.1
sphinx-togglebutton==0.3.2 sphinx-togglebutton==0.3.2
myst-parser==3.0.1 myst-parser==3.0.1 # `myst-parser==4.0.1` breaks inline code in titles
msgspec msgspec
cloudpickle snowballstemmer<3 # https://github.com/snowballstem/snowball/issues/229
commonmark # Required by sphinx-argparse when using :markdownhelp: commonmark # Required by sphinx-argparse when using :markdownhelp:
# Custom autodoc2 is necessary for faster docstring processing
# see: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33#issuecomment-2856386035
git+https://github.com/hmellor/sphinx-autodoc2.git # sphinx-autodoc2==0.5.0
# packages to install to build the documentation # packages to install to build the documentation
cachetools cachetools
pydantic >= 2.8
-f https://download.pytorch.org/whl/cpu -f https://download.pytorch.org/whl/cpu
torch torch
py-cpuinfo \ No newline at end of file
transformers
mistral_common >= 1.5.4
aiohttp
starlette
scipy
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
requests
zmq
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment