Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
......@@ -20,6 +20,7 @@ Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import argparse
import contextlib
import os
......@@ -49,8 +50,7 @@ def setup_environment_variables(vllm_version: str):
@contextlib.contextmanager
def build_llm_with_lmcache(lmcache_connector: str, model: str,
vllm_version: str):
def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
ktc = KVTransferConfig(
kv_connector=lmcache_connector,
kv_role="kv_both",
......@@ -97,18 +97,19 @@ def print_output(
for output in outputs:
generated_text = output.outputs[0].text
print(f"Generated text: {generated_text!r}")
print(f"Generation took {time.time() - start:.2f} seconds, "
f"{req_str} request done.")
print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.")
print("-" * 50)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("-v",
parser.add_argument(
"-v",
"--version",
choices=["v0", "v1"],
default="v1",
help="Specify vLLM version (default: v1)")
help="Specify vLLM version (default: v1)",
)
return parser.parse_args()
......@@ -125,7 +126,6 @@ def main():
setup_environment_variables(args.version)
with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
# This example script runs two requests with a shared prefix.
# Define the shared prompt and specific prompts
shared_prompt = "Hello, how are you?" * 1000
......@@ -136,9 +136,7 @@ def main():
shared_prompt + "Tell me a very long story",
]
sampling_params = SamplingParams(temperature=0,
top_p=0.95,
max_tokens=10)
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
# Print the first output
print_output(llm, first_prompt, sampling_params, "first")
......
......@@ -10,6 +10,7 @@ vLLM prefill node -> LMCache server -> vLLM decode node.
Note that `pip install lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import os
import subprocess
import time
......@@ -49,19 +50,23 @@ def run_prefill(prefill_done, prompts):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
ktc = KVTransferConfig(kv_connector="LMCacheConnector",
ktc = KVTransferConfig(
kv_connector="LMCacheConnector",
kv_role="kv_producer",
kv_rank=0,
kv_parallel_size=2)
kv_parallel_size=2,
)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
llm = LLM(
model="mistralai/Mistral-7B-Instruct-v0.2",
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
enforce_eager=True)
enforce_eager=True,
)
#llm.generate(prompts, sampling_params)
# llm.generate(prompts, sampling_params)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
generated_text = output.outputs[0].text
......@@ -79,17 +84,21 @@ def run_decode(prefill_done, prompts, timeout=1):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
ktc = KVTransferConfig(kv_connector="LMCacheConnector",
ktc = KVTransferConfig(
kv_connector="LMCacheConnector",
kv_role="kv_consumer",
kv_rank=1,
kv_parallel_size=2)
kv_parallel_size=2,
)
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# of memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
llm = LLM(
model="mistralai/Mistral-7B-Instruct-v0.2",
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
enforce_eager=True)
enforce_eager=True,
)
print("Waiting for prefill node to finish...")
prefill_done.wait()
......@@ -105,10 +114,9 @@ def run_decode(prefill_done, prompts, timeout=1):
def run_lmcache_server(port):
server_proc = subprocess.Popen([
"python", "-m", "lmcache.experimental.server", "localhost",
str(port)
])
server_proc = subprocess.Popen(
["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
)
return server_proc
......
......@@ -17,13 +17,17 @@ async def lifespan(app: FastAPI):
Lifespan context manager to handle startup and shutdown events.
"""
# Startup: Initialize clients
prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1'
decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1'
app.state.prefill_client = httpx.AsyncClient(timeout=None,
base_url=prefiller_base_url)
app.state.decode_client = httpx.AsyncClient(timeout=None,
base_url=decoder_base_url)
prefiller_base_url = (
f"http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1"
)
decoder_base_url = (
f"http://{global_args.decoder_host}:{global_args.decoder_port}/v1"
)
app.state.prefill_client = httpx.AsyncClient(
timeout=None, base_url=prefiller_base_url
)
app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url)
yield
......@@ -37,7 +41,6 @@ app = FastAPI(lifespan=lifespan)
class StatsCalculator:
def __init__(self):
self._stats = []
self._last_log_time = time.time()
......@@ -51,13 +54,18 @@ class StatsCalculator:
def _log_stats(self):
# Print average, median, and 99th percentile
np_arr = np.array(self._stats)
output_str = f"\nNum requests: {len(self._stats)}" + \
"\nPrefill node TTFT stats:" + \
f"\n - Average (ms): {np.mean(np_arr)}" + \
f"\n - Median (ms): {np.median(np_arr)}" + \
f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
print("===============================", output_str,
"===============================")
output_str = (
f"\nNum requests: {len(self._stats)}"
+ "\nPrefill node TTFT stats:"
+ f"\n - Average (ms): {np.mean(np_arr)}"
+ f"\n - Median (ms): {np.median(np_arr)}"
+ f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
)
print(
"===============================",
output_str,
"===============================",
)
stats_calculator = StatsCalculator()
......@@ -82,15 +90,16 @@ app.state.prefill_client = None
app.state.decode_client = None
async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
req_data: dict):
async def send_request_to_service(
client: httpx.AsyncClient, endpoint: str, req_data: dict
):
"""
Send a request to a service using a persistent client.
"""
req_data = req_data.copy()
req_data['max_tokens'] = 1
if 'max_completion_tokens' in req_data:
req_data['max_completion_tokens'] = 1
req_data["max_tokens"] = 1
if "max_completion_tokens" in req_data:
req_data["max_completion_tokens"] = 1
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
response = await client.post(endpoint, json=req_data, headers=headers)
......@@ -98,14 +107,16 @@ async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
return response
async def stream_service_response(client: httpx.AsyncClient, endpoint: str,
req_data: dict):
async def stream_service_response(
client: httpx.AsyncClient, endpoint: str, req_data: dict
):
"""
Asynchronously stream the response from a service using a persistent client.
"""
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
async with client.stream("POST", endpoint, json=req_data,
headers=headers) as response:
async with client.stream(
"POST", endpoint, json=req_data, headers=headers
) as response:
response.raise_for_status()
async for chunk in response.aiter_bytes():
yield chunk
......@@ -121,28 +132,28 @@ async def handle_completions(request: Request):
req_data = await request.json()
# Send request to prefill service, ignore the response
await send_request_to_service(app.state.prefill_client, "/completions",
req_data)
await send_request_to_service(
app.state.prefill_client, "/completions", req_data
)
et = time.time()
stats_calculator.add(et - st)
# Stream response from decode service
async def generate_stream():
async for chunk in stream_service_response(app.state.decode_client,
"/completions",
req_data):
async for chunk in stream_service_response(
app.state.decode_client, "/completions", req_data
):
yield chunk
return StreamingResponse(generate_stream(),
media_type="application/json")
return StreamingResponse(generate_stream(), media_type="text/event-stream")
except Exception as e:
import sys
import traceback
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server"
" - completions endpoint")
print("Error occurred in disagg prefill proxy server - completions endpoint")
print(e)
print("".join(traceback.format_exception(*exc_info)))
raise
......@@ -158,36 +169,39 @@ async def handle_chat_completions(request: Request):
req_data = await request.json()
# Send request to prefill service, ignore the response
await send_request_to_service(app.state.prefill_client,
"/chat/completions", req_data)
await send_request_to_service(
app.state.prefill_client, "/chat/completions", req_data
)
et = time.time()
stats_calculator.add(et - st)
# Stream response from decode service
async def generate_stream():
async for chunk in stream_service_response(app.state.decode_client,
"/chat/completions",
req_data):
async for chunk in stream_service_response(
app.state.decode_client, "/chat/completions", req_data
):
yield chunk
return StreamingResponse(generate_stream(),
media_type="application/json")
return StreamingResponse(generate_stream(), media_type="text/event-stream")
except Exception as e:
import sys
import traceback
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server "
" - chat completions endpoint")
print(
"Error occurred in disagg prefill proxy server - chat completions endpoint"
)
print(e)
print("".join(traceback.format_exception(*exc_info)))
raise
if __name__ == '__main__':
if __name__ == "__main__":
global global_args
global_args = parse_args()
import uvicorn
uvicorn.run(app, host=global_args.host, port=global_args.port)
......@@ -10,6 +10,7 @@ KV cache is transferred in the following manner:
Note that lmcache needs to be installed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache.
"""
import os
import subprocess
import time
......@@ -49,15 +50,16 @@ def run_store(store_done, prompts):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
kv_role="kv_both")
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
llm = LLM(
model="mistralai/Mistral-7B-Instruct-v0.2",
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
enforce_eager=True)
enforce_eager=True,
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
......@@ -76,15 +78,16 @@ def run_retrieve(store_done, prompts, timeout=1):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
kv_role="kv_both")
ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# of memory. Reduce the value if your GPU has less memory.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
llm = LLM(
model="mistralai/Mistral-7B-Instruct-v0.2",
kv_transfer_config=ktc,
max_model_len=8000,
gpu_memory_utilization=0.8,
enforce_eager=True)
enforce_eager=True,
)
print("Waiting for KV cache store to finish...")
store_done.wait()
......@@ -100,10 +103,9 @@ def run_retrieve(store_done, prompts, timeout=1):
def run_lmcache_server(port):
server_proc = subprocess.Popen([
"python", "-m", "lmcache.experimental.server", "localhost",
str(port)
])
server_proc = subprocess.Popen(
["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
)
return server_proc
......
......@@ -6,11 +6,15 @@ import json
import os
import uuid
from vllm import LLM
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
from vllm.lora.request import LoRARequest
from vllm.model_executor.model_loader.tensorizer import (
TensorizerArgs,
TensorizerConfig,
tensorize_vllm_model)
tensorize_lora_adapter,
tensorize_vllm_model,
)
from vllm.utils import FlexibleArgumentParser
# yapf conflicts with isort for this docstring
......@@ -27,7 +31,7 @@ https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:
python -m examples.other.tensorize_vllm_model \
python examples/others/tensorize_vllm_model.py \
--model facebook/opt-125m \
serialize \
--serialized-directory s3://my-bucket \
......@@ -47,7 +51,7 @@ providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root
level of this repository:
python -m examples.other.tensorize_vllm_model \
python examples/others/tensorize_vllm_model.py \
--model EleutherAI/gpt-j-6B \
--dtype float16 \
deserialize \
......@@ -65,11 +69,11 @@ shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
For more information on the available arguments for serializing, run
`python -m examples.other.tensorize_vllm_model serialize --help`.
`python examples/others/tensorize_vllm_model.py serialize --help`.
Or for deserializing:
`python -m examples.other.tensorize_vllm_model deserialize --help`.
`python examples/others/tensorize_vllm_model.py deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:
......@@ -90,11 +94,27 @@ TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:
`python -m examples.other.tensorize_vllm_model deserialize --help`
`python examples/others/tensorize_vllm_model.py deserialize --help`
under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter
can be serialized directly with the path to the LoRA adapter on HF Hub and
a TensorizerConfig object. In this script, passing the HF id of a LoRA adapter via `--lora-path`
will serialize the LoRA adapter artifacts to `--serialized-directory`.
You can then use the LoRA adapter with `vllm serve`, for instance, by ensuring
the LoRA artifacts are in your model artifacts directory and specifying
`--enable-lora`. For instance:
```
vllm serve <model_path> \
--load-format tensorizer \
--model-loader-extra-config '{"tensorizer_uri": "<model_path>.tensors"}' \
--enable-lora
```
"""
......@@ -107,6 +127,19 @@ def parse_args():
"also supported, although libsodium must be installed to "
"use it.")
parser = EngineArgs.add_cli_args(parser)
parser.add_argument(
"--lora-path",
type=str,
required=False,
help="Path to a LoRA adapter to "
"serialize along with model tensors. This can then be deserialized "
"along with the model by passing a tensorizer_config kwarg to "
"LoRARequest with type TensorizerConfig. See the docstring for this "
"for a usage example."
)
subparsers = parser.add_subparsers(dest='command')
serialize_parser = subparsers.add_parser(
......@@ -169,6 +202,37 @@ def parse_args():
def deserialize():
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
model_loader_extra_config=tensorizer_config,
enable_lora=True,
)
sampling_params = SamplingParams(
temperature=0,
max_tokens=256,
stop=["[/assistant]"]
)
# Truncating this as the extra text isn't necessary
prompts = [
"[user] Write a SQL query to answer the question based on ..."
]
# Test LoRA load
print(
llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest("sql-lora",
1,
args.lora_path,
tensorizer_config = tensorizer_config)
)
)
else:
llm = LLM(model=args.model,
load_format="tensorizer",
tensor_parallel_size=args.tensor_parallel_size,
......@@ -197,7 +261,10 @@ if __name__ == '__main__':
model_name = model_ref.split("/")[1]
keyfile = args.keyfile if args.keyfile else None
if args.command == "serialize" or args.command == "deserialize":
keyfile = args.keyfile
else:
keyfile = None
if args.model_loader_extra_config:
config = json.loads(args.model_loader_extra_config)
......@@ -228,6 +295,10 @@ if __name__ == '__main__':
encryption_keyfile=keyfile,
**credentials)
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
tensorize_lora_adapter(args.lora_path, tensorizer_config)
tensorize_vllm_model(engine_args, tensorizer_config)
elif args.command == "deserialize":
......
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.lint.isort]
known-first-party = ["vllm"]
[tool.ruff.format]
docstring-code-format = true
\ No newline at end of file
{{- bos_token }}
{%- if custom_tools is defined %}
{%- if custom_tools is defined and custom_tools%}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = false %}
{%- endif %}
{%- if not tools is defined %}
{%- if tools is defined and tools %}
{%- set tool_definition = tool_definition ~ (tools | tojson(indent=4)) %}
{%- else %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set user_provided_system_message = true %}
{%- if messages[0]['content'] is string %}
{%- set system_message = messages[0]['content']|trim %}
{%- else %}
......@@ -19,66 +20,31 @@
{%- set messages = messages[1:] %}
{%- else %}
{%- if tools is not none %}
{#- Add default tool system message when tools are provided #}
{%- set system_message = "You are a helpful assistant with tool calling "
"capabilities. Only reply with a tool call if the function exists in the "
"library provided by the user. If it doesn't exist, just reply directly in "
"natural language. When you receive a tool call response, use the output to "
"format an answer to the original user question." %}
{#- Since no system_message was provided by the user, if tools are provided, system_message becomes the default tool system message #}
{#- This system message is from the Llama website: https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/ #}
{%- set system_message = "You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:\n\n1. FUNCTION CALLS:\n- ONLY use functions that are EXPLICITLY listed in the function list below\n- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If a function is not in the list, respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)\n- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\nExamples:\nCORRECT: [get_weather(location=\"Vancouver\"), calculate_route(start=\"Boston\", end=\"New York\")] <- Only if get_weather and calculate_route are in function list\nINCORRECT: get_weather(location=\"New York\")\nINCORRECT: Let me check the weather: [get_weather(location=\"New York\")]\nINCORRECT: [get_events(location=\"Singapore\")] <- If function not in list\n\n2. RESPONSE RULES:\n- For pure function requests matching a listed function: ONLY output the function call(s)\n- For knowledge questions: ONLY output text\n- For missing parameters: ONLY request the specific missing parameters\n- For unavailable services (not in function list): output ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\". Do NOT execute a function call.\n- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations\n- NEVER combine text and function calls in the same response\n- NEVER suggest alternative functions when the requested service is unavailable\n- NEVER create or invent new functions not listed below\n\n3. 
STRICT BOUNDARIES:\n- ONLY use functions from the list below - no exceptions\n- NEVER use a function as an alternative to unavailable information\n- NEVER call functions not present in the function list\n- NEVER add explanatory text to function calls\n- NEVER respond with empty brackets\n- Use proper Python/JSON syntax for function calls\n- Check the function list carefully before responding\n\n4. TOOL RESPONSE HANDLING:\n- When receiving tool responses: provide concise, natural language responses\n- Don't repeat tool response verbatim\n- Don't add supplementary information\n\nHere is a list of functions in JSON format that you can invoke:\n" %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}
{%- endif %}
{#- System message if the user supplied one, or if tools are used (default tool system message) #}
{#- Now writing the system message: use the user provided system message if user_provided_system_message, else the default tool system message if tools are present #}
{%- if system_message %}
{#- always use user provided system message to override default tool system message #}
{{- "<|header_start|>system<|header_end|>\n\n" }}
{{- system_message }}
{%- if tools is not none and not tools_in_user_message %}
{{- "Tools: You have access to the following tools. You might need to use one "
"or more function/tool calls to fulfill the task. \n"
"If none are needed, then proceed to the response.\n\n"
"Tool Call Syntax: You can call tools using the following syntax:\n"
"[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n"
"Do not include anything else when calling the tools with the syntax above.\n\n"
"Here is a list of functions in JSON format that you can invoke.\n " }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- if user_provided_system_message and tools %}
{{- "\nHere is a list of functions in JSON format that you can invoke. Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\n" }}
{{- tool_definition -}}
{%- elif tool_definition %}
{{- tool_definition -}}
{%- endif %}
{{- "<|eot|>" }}
{%- endif %}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and tools is not none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- if messages[0]['content'] is string %}
{%- set first_user_message = messages[0]['content']|trim %}
{%- else %}
{%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %}
{%- endif %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|header_start|>user<|header_end|>\n\n' -}}
{{- first_user_message}}
{{- "\nHere is a list of functions in JSON format that you can invoke:"}}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- "Should you decide to return the function call(s), put them in the format "
"of [func_name1(params_name1=params_value1, params_name2=params_value2, "
"...), ...]\nDo not include anything else when calling the tools with the "
"syntax above." }}
{%- endif %}
{#- Now deal with all other messages #}
{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{#- Base case: messages that are not from the tool role and have an empty tool_calls list #}
{%- if not (message.role == 'ipython' or message.role == 'tool' or ('tool_calls' in message and message.tool_calls|length != 0 )) %}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
{%- if message['content'] is string %}
{{- message['content'] }}
......@@ -92,8 +58,10 @@
{%- endfor %}
{%- endif %}
{{- "<|eot|>" }}
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
{%- set tool_call = message.tool_calls[0].function %}
{#- Tool case: the message has a non-empty tool_calls list, so it must come from the assistant #}
{%- elif 'tool_calls' in message %}
{#- assume tool_calls are always coming from assistant #}
{%- if message.role == 'assistant' %}
{{- '<|header_start|>assistant<|header_end|>\n\n' -}}
{%- if message['content'] is string %}
{{- message['content'] }}
......@@ -106,20 +74,24 @@
{%- endif %}
{%- endfor %}
{%- endif %}
{{- "[" }}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- tool_call.name + '(' -}}
{%- for param in tool_call.arguments %}
{{- param + '=' -}}
{{- param + '="' -}}
{{- "%s" | format(tool_call.arguments[param]) -}}
{{- '"' -}}
{% if not loop.last %}, {% endif %}
{%- endfor %}
{{- ')' -}}
{% if not loop.last %}, {% endif %}
{%- endfor %}
{{- "<|eom|>" }}
{{- "]<|eot|>" }}
{%- endif %}
{#- Tool_response case: messages are from tool_response #}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|header_start|>ipython<|header_end|>\n\n" }}
{%- if message.content is string %}
......@@ -131,7 +103,7 @@
{%- endif %}
{%- endfor %}
{%- endif %}
{{- "<|eom|>" }}
{{- "<|eot|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
......
site_name: vLLM
site_url: https://docs.vllm.ai
repo_url: https://github.com/vllm-project/vllm
exclude_docs: |
*.inc.md
*.template.md
theme:
name: material
logo: assets/logos/vllm-logo-only-light.ico
favicon: assets/logos/vllm-logo-only-light.ico
palette:
# Palette toggle for automatic mode
- media: "(prefers-color-scheme)"
toggle:
icon: material/brightness-auto
name: Switch to light mode
# Palette toggle for light mode
- media: "(prefers-color-scheme: light)"
scheme: default
primary: white
toggle:
icon: material/brightness-7
name: Switch to dark mode
# Palette toggle for dark mode
- media: "(prefers-color-scheme: dark)"
scheme: slate
primary: black
toggle:
icon: material/brightness-2
name: Switch to system preference
features:
- content.code.copy
- content.tabs.link
- navigation.tracking
- navigation.tabs
- navigation.sections
- navigation.prune
- navigation.top
- search.highlight
- search.share
- toc.follow
custom_dir: docs/mkdocs/overrides
hooks:
- docs/mkdocs/hooks/remove_announcement.py
- docs/mkdocs/hooks/generate_examples.py
- docs/mkdocs/hooks/url_schemes.py
# Required to stop api-autonav from raising an error
# https://github.com/tlambert03/mkdocs-api-autonav/issues/16
nav:
- api
plugins:
- meta
- search
- autorefs
- awesome-nav
# For API reference generation
- api-autonav:
modules: ["vllm"]
api_root_uri: "api"
exclude:
- "re:vllm\\._.*" # Internal modules
- "vllm.third_party"
- "vllm.vllm_flash_attn"
- mkdocstrings:
handlers:
python:
options:
show_symbol_type_heading: true
show_symbol_type_toc: true
filters: []
summary:
modules: true
show_if_no_docstring: true
show_signature_annotations: true
separate_signature: true
show_overloads: true
signature_crossrefs: true
inventories:
- https://docs.python.org/3/objects.inv
- https://typing-extensions.readthedocs.io/en/latest/objects.inv
- https://docs.aiohttp.org/en/stable/objects.inv
- https://pillow.readthedocs.io/en/stable/objects.inv
- https://numpy.org/doc/stable/objects.inv
- https://pytorch.org/docs/stable/objects.inv
- https://psutil.readthedocs.io/en/stable/objects.inv
markdown_extensions:
- attr_list
- md_in_html
- admonition
- pymdownx.details
# For content tabs
- pymdownx.superfences
- pymdownx.tabbed:
slugify: !!python/object/apply:pymdownx.slugs.slugify
kwds:
case: lower
alternate_style: true
# For code highlighting
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.snippets
# For emoji and icons
- pymdownx.emoji:
emoji_index: !!python/name:material.extensions.emoji.twemoji
emoji_generator: !!python/name:material.extensions.emoji.to_svg
# For in page [TOC] (not sidebar)
- toc:
permalink: true
# For math rendering
- mdx_math:
enable_dollar_delimiter: true
extra_css:
- mkdocs/stylesheets/extra.css
extra_javascript:
- mkdocs/javascript/run_llm_widget.js
- https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
# Makes the url format end in .html rather than act as a dir
# So index.md generates as index.html and is available under URL /index.html
# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
use_directory_urls: false
......@@ -8,6 +8,7 @@ requires = [
"setuptools-scm>=8.0",
"torch == 2.7.0",
"wheel",
"regex",
"jinja2",
]
build-backend = "setuptools.build_meta"
......@@ -35,8 +36,8 @@ dynamic = [ "version", "dependencies", "optional-dependencies"]
[project.urls]
Homepage="https://github.com/vllm-project/vllm"
Documentation="https://vllm.readthedocs.io/en/latest/"
Slack="http://slack.vllm.ai/"
Documentation="https://docs.vllm.ai/en/latest/"
Slack="https://slack.vllm.ai/"
[project.scripts]
vllm = "vllm.entrypoints.cli.main:main"
......@@ -56,16 +57,12 @@ ignore_patterns = [
".buildkite/**",
"benchmarks/**",
"build/**",
"examples/**",
]
[tool.ruff]
# Allow lines to be as long as 80.
line-length = 80
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
......@@ -148,6 +145,7 @@ skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora
skip_glob = [
".buildkite/*",
"benchmarks/*",
"examples/*",
]
use_parentheses = true
skip_gitignore = true
......@@ -165,9 +163,12 @@ markers = [
[tool.pymarkdown]
plugins.md004.style = "sublist" # ul-style
plugins.md007.indent = 4 # ul-indent
plugins.md007.start_indented = true # ul-indent
plugins.md013.enabled = false # line-length
plugins.md041.enabled = false # first-line-h1
plugins.md033.enabled = false # inline-html
plugins.md046.enabled = false # code-block-style
plugins.md024.allow_different_nesting = true # no-duplicate-headers
[tool.ty]
......
......@@ -7,3 +7,4 @@ setuptools-scm>=8
torch==2.7.0
wheel
jinja2>=3.1.6
regex
regex # Replace re for higher-performance regex matching
cachetools
psutil
sentencepiece # Required for LLaMA tokenizer.
......@@ -7,7 +8,7 @@ tqdm
blake3
py-cpuinfo
transformers >= 4.51.1
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
huggingface-hub[hf_xet] >= 0.32.0 # Required for Xet downloads.
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
......@@ -40,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md
python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0 # vllm.tracing
......
......@@ -2,11 +2,12 @@
-r common.txt
# Dependencies for CPUs
packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.7.0+cpu; platform_machine == "x86_64"
torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0.dev20250304; platform_machine == "s390x"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
......@@ -19,3 +20,7 @@ datasets # for benchmark scripts
# cpu cannot use triton 3.3.0
triton==3.2.0; platform_machine == "x86_64"
# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
sphinx==7.4.7
sphinx-argparse==0.5.2
sphinx-book-theme==1.1.4
sphinx-copybutton==0.5.2
sphinx-design==0.6.1
sphinx-togglebutton==0.3.2
myst-parser==3.0.1 # `myst-parser==4.0.1` breaks inline code in titles
msgspec
snowballstemmer<3 # https://github.com/snowballstem/snowball/issues/229
commonmark # Required by sphinx-argparse when using :markdownhelp:
# Custom autodoc2 is necessary for faster docstring processing
# see: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33#issuecomment-2856386035
git+https://github.com/hmellor/sphinx-autodoc2.git # sphinx-autodoc2==0.5.0
# packages to install to build the documentation
cachetools
-f https://download.pytorch.org/whl/cpu
torch
\ No newline at end of file
mkdocs
mkdocs-api-autonav
mkdocs-material
mkdocstrings-python
mkdocs-gen-files
mkdocs-awesome-nav
python-markdown-math
regex
ruff
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment