Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

4eabe123 · zhuwenwen · 45840cd2 · 58738772 · 4eabe123 · 4eabe123
Commit 4eabe123 authored May 28, 2025 by zhuwenwen
20 changed files
--- a/examples/lmcache/README.md
+++ b/examples/lmcache/README.md
--- a/examples/lmcache/cpu_offload_lmcache.py
+++ b/examples/lmcache/cpu_offload_lmcache.py
@@ -20,6 +20,7 @@ Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
 Learn more about LMCache environment setup, please refer to:
 https://docs.lmcache.ai/getting_started/installation.html
 """
+
 import argparse
 import contextlib
 import os
@@ -49,8 +50,7 @@ def setup_environment_variables(vllm_version: str):


 @contextlib.contextmanager
-def build_llm_with_lmcache(lmcache_connector: str, model: str,
-                           vllm_version: str):
+def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
    ktc = KVTransferConfig(
        kv_connector=lmcache_connector,
        kv_role="kv_both",
@@ -97,18 +97,19 @@ def print_output(
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
-    print(f"Generation took {time.time() - start:.2f} seconds, "
-          f"{req_str} request done.")
+    print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.")
    print("-" * 50)


 def parse_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument("-v",
+    parser.add_argument(
+        "-v",
        "--version",
        choices=["v0", "v1"],
        default="v1",
-                        help="Specify vLLM version (default: v1)")
+        help="Specify vLLM version (default: v1)",
+    )
    return parser.parse_args()


@@ -125,7 +126,6 @@ def main():
    setup_environment_variables(args.version)

    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
-
        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
        shared_prompt = "Hello, how are you?" * 1000
@@ -136,9 +136,7 @@ def main():
            shared_prompt + "Tell me a very long story",
        ]

-        sampling_params = SamplingParams(temperature=0,
-                                         top_p=0.95,
-                                         max_tokens=10)
+        sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")

--- a/examples/lmcache/disagg_prefill_lmcache_v0.py
+++ b/examples/lmcache/disagg_prefill_lmcache_v0.py
@@ -10,6 +10,7 @@ vLLM prefill node -> LMCache server -> vLLM decode node.
 Note that `pip install lmcache` is needed to run this example.
 Learn more about LMCache in https://github.com/LMCache/LMCache.
 """
+
 import os
 import subprocess
 import time
@@ -49,19 +50,23 @@ def run_prefill(prefill_done, prompts):

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

-    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
+    ktc = KVTransferConfig(
+        kv_connector="LMCacheConnector",
        kv_role="kv_producer",
        kv_rank=0,
-                           kv_parallel_size=2)
+        kv_parallel_size=2,
+    )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+    llm = LLM(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=ktc,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
-              enforce_eager=True)
+        enforce_eager=True,
+    )

-    #llm.generate(prompts, sampling_params)
+    # llm.generate(prompts, sampling_params)
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        generated_text = output.outputs[0].text
@@ -79,17 +84,21 @@ def run_decode(prefill_done, prompts, timeout=1):

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

-    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
+    ktc = KVTransferConfig(
+        kv_connector="LMCacheConnector",
        kv_role="kv_consumer",
        kv_rank=1,
-                           kv_parallel_size=2)
+        kv_parallel_size=2,
+    )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # of memory. Reduce the value if your GPU has less memory.
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+    llm = LLM(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=ktc,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
-              enforce_eager=True)
+        enforce_eager=True,
+    )

    print("Waiting for prefill node to finish...")
    prefill_done.wait()
@@ -105,10 +114,9 @@ def run_decode(prefill_done, prompts, timeout=1):


 def run_lmcache_server(port):
-    server_proc = subprocess.Popen([
-        "python", "-m", "lmcache.experimental.server", "localhost",
-        str(port)
-    ])
+    server_proc = subprocess.Popen(
+        ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
+    )
    return server_proc



--- a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
+++ b/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
--- a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
+++ b/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
--- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
+++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
--- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
+++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
@@ -17,13 +17,17 @@ async def lifespan(app: FastAPI):
    Lifespan context manager to handle startup and shutdown events.
    """
    # Startup: Initialize clients
-    prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1'
-    decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1'
-
-    app.state.prefill_client = httpx.AsyncClient(timeout=None,
-                                                 base_url=prefiller_base_url)
-    app.state.decode_client = httpx.AsyncClient(timeout=None,
-                                                base_url=decoder_base_url)
+    prefiller_base_url = (
+        f"http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1"
+    )
+    decoder_base_url = (
+        f"http://{global_args.decoder_host}:{global_args.decoder_port}/v1"
+    )
+
+    app.state.prefill_client = httpx.AsyncClient(
+        timeout=None, base_url=prefiller_base_url
+    )
+    app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url)

    yield

@@ -37,7 +41,6 @@ app = FastAPI(lifespan=lifespan)


 class StatsCalculator:
-
    def __init__(self):
        self._stats = []
        self._last_log_time = time.time()
@@ -51,13 +54,18 @@ class StatsCalculator:
    def _log_stats(self):
        # Print average, median, and 99th percentile
        np_arr = np.array(self._stats)
-        output_str = f"\nNum requests: {len(self._stats)}" + \
-                "\nPrefill node TTFT stats:" + \
-                f"\n - Average (ms): {np.mean(np_arr)}" + \
-                f"\n - Median (ms): {np.median(np_arr)}" + \
-                f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
-        print("===============================", output_str,
-              "===============================")
+        output_str = (
+            f"\nNum requests: {len(self._stats)}"
+            + "\nPrefill node TTFT stats:"
+            + f"\n - Average (ms): {np.mean(np_arr)}"
+            + f"\n - Median (ms): {np.median(np_arr)}"
+            + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
+        )
+        print(
+            "===============================",
+            output_str,
+            "===============================",
+        )


 stats_calculator = StatsCalculator()
@@ -82,15 +90,16 @@ app.state.prefill_client = None
 app.state.decode_client = None


-async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
-                                  req_data: dict):
+async def send_request_to_service(
+    client: httpx.AsyncClient, endpoint: str, req_data: dict
+):
    """
    Send a request to a service using a persistent client.
    """
    req_data = req_data.copy()
-    req_data['max_tokens'] = 1
-    if 'max_completion_tokens' in req_data:
-        req_data['max_completion_tokens'] = 1
+    req_data["max_tokens"] = 1
+    if "max_completion_tokens" in req_data:
+        req_data["max_completion_tokens"] = 1

    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
    response = await client.post(endpoint, json=req_data, headers=headers)
@@ -98,14 +107,16 @@ async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
    return response


-async def stream_service_response(client: httpx.AsyncClient, endpoint: str,
-                                  req_data: dict):
+async def stream_service_response(
+    client: httpx.AsyncClient, endpoint: str, req_data: dict
+):
    """
    Asynchronously stream the response from a service using a persistent client.
    """
    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-    async with client.stream("POST", endpoint, json=req_data,
-                             headers=headers) as response:
+    async with client.stream(
+        "POST", endpoint, json=req_data, headers=headers
+    ) as response:
        response.raise_for_status()
        async for chunk in response.aiter_bytes():
            yield chunk
@@ -121,28 +132,28 @@ async def handle_completions(request: Request):
        req_data = await request.json()

        # Send request to prefill service, ignore the response
-        await send_request_to_service(app.state.prefill_client, "/completions",
-                                      req_data)
+        await send_request_to_service(
+            app.state.prefill_client, "/completions", req_data
+        )

        et = time.time()
        stats_calculator.add(et - st)

        # Stream response from decode service
        async def generate_stream():
-            async for chunk in stream_service_response(app.state.decode_client,
-                                                       "/completions",
-                                                       req_data):
+            async for chunk in stream_service_response(
+                app.state.decode_client, "/completions", req_data
+            ):
                yield chunk

-        return StreamingResponse(generate_stream(),
-                                 media_type="application/json")
+        return StreamingResponse(generate_stream(), media_type="text/event-stream")

    except Exception as e:
        import sys
        import traceback
+
        exc_info = sys.exc_info()
-        print("Error occurred in disagg prefill proxy server"
-              " - completions endpoint")
+        print("Error occurred in disagg prefill proxy server - completions endpoint")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))
        raise
@@ -158,36 +169,39 @@ async def handle_chat_completions(request: Request):
        req_data = await request.json()

        # Send request to prefill service, ignore the response
-        await send_request_to_service(app.state.prefill_client,
-                                      "/chat/completions", req_data)
+        await send_request_to_service(
+            app.state.prefill_client, "/chat/completions", req_data
+        )

        et = time.time()
        stats_calculator.add(et - st)

        # Stream response from decode service
        async def generate_stream():
-            async for chunk in stream_service_response(app.state.decode_client,
-                                                       "/chat/completions",
-                                                       req_data):
+            async for chunk in stream_service_response(
+                app.state.decode_client, "/chat/completions", req_data
+            ):
                yield chunk

-        return StreamingResponse(generate_stream(),
-                                 media_type="application/json")
+        return StreamingResponse(generate_stream(), media_type="text/event-stream")

    except Exception as e:
        import sys
        import traceback
+
        exc_info = sys.exc_info()
-        print("Error occurred in disagg prefill proxy server "
-              " - chat completions endpoint")
+        print(
+            "Error occurred in disagg prefill proxy server  - chat completions endpoint"
+        )
        print(e)
        print("".join(traceback.format_exception(*exc_info)))
        raise


-if __name__ == '__main__':
+if __name__ == "__main__":
    global global_args
    global_args = parse_args()

    import uvicorn
+
    uvicorn.run(app, host=global_args.host, port=global_args.port)
--- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
+++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
--- a/examples/lmcache/kv_cache_sharing_lmcache_v1.py
+++ b/examples/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -10,6 +10,7 @@ KV cache is transferred in the following manner:
 Note that lmcache needs to be installed to run this example.
 Learn more about LMCache in https://github.com/LMCache/LMCache.
 """
+
 import os
 import subprocess
 import time
@@ -49,15 +50,16 @@ def run_store(store_done, prompts):

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

-    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
-                           kv_role="kv_both")
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+    llm = LLM(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=ktc,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
-              enforce_eager=True)
+        enforce_eager=True,
+    )

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
@@ -76,15 +78,16 @@ def run_retrieve(store_done, prompts, timeout=1):

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

-    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
-                           kv_role="kv_both")
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # of memory. Reduce the value if your GPU has less memory.
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+    llm = LLM(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
        kv_transfer_config=ktc,
        max_model_len=8000,
        gpu_memory_utilization=0.8,
-              enforce_eager=True)
+        enforce_eager=True,
+    )

    print("Waiting for KV cache store to finish...")
    store_done.wait()
@@ -100,10 +103,9 @@ def run_retrieve(store_done, prompts, timeout=1):


 def run_lmcache_server(port):
-    server_proc = subprocess.Popen([
-        "python", "-m", "lmcache.experimental.server", "localhost",
-        str(port)
-    ])
+    server_proc = subprocess.Popen(
+        ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
+    )
    return server_proc



--- a/examples/other/logging_configuration.md
+++ b/examples/other/logging_configuration.md
--- a/examples/other/tensorize_vllm_model.py
+++ b/examples/other/tensorize_vllm_model.py
@@ -6,11 +6,15 @@ import json
 import os
 import uuid

-from vllm import LLM
+from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
+from vllm.lora.request import LoRARequest
+from vllm.model_executor.model_loader.tensorizer import (
+    TensorizerArgs,
    TensorizerConfig,
-                                                         tensorize_vllm_model)
+    tensorize_lora_adapter,
+    tensorize_vllm_model,
+)
 from vllm.utils import FlexibleArgumentParser

 # yapf conflicts with isort for this docstring
@@ -27,7 +31,7 @@ https://github.com/coreweave/tensorizer
 To serialize a model, install vLLM from source, then run something 
 like this from the root level of this repository:

-python -m examples.other.tensorize_vllm_model \
+python examples/other/tensorize_vllm_model.py \
   --model facebook/opt-125m \
   serialize \
   --serialized-directory s3://my-bucket \
@@ -47,7 +51,7 @@ providing a `--keyfile` argument.
 To deserialize a model, you can run something like this from the root 
 level of this repository:

-python -m examples.other.tensorize_vllm_model \
+python examples/other/tensorize_vllm_model.py \
   --model EleutherAI/gpt-j-6B \
   --dtype float16 \
   deserialize \
@@ -65,11 +69,11 @@ shard's rank. Sharded models serialized with this script will be named as
 model-rank-%03d.tensors

 For more information on the available arguments for serializing, run 
-`python -m examples.other.tensorize_vllm_model serialize --help`.
+`python examples/other/tensorize_vllm_model.py serialize --help`.

 Or for deserializing:

-`python -m examples.other.tensorize_vllm_model deserialize --help`.
+`python examples/other/tensorize_vllm_model.py deserialize --help`.

 Once a model is serialized, tensorizer can be invoked with the `LLM` class 
 directly to load models:
@@ -90,11 +94,27 @@ TensorizerConfig arguments desired.
 In order to see all of the available arguments usable to configure 
 loading with tensorizer that are given to `TensorizerConfig`, run:

-`python -m examples.other.tensorize_vllm_model deserialize --help`
+`python examples/other/tensorize_vllm_model.py deserialize --help`

 under the `tensorizer options` section. These can also be used for
 deserialization in this example script, although `--tensorizer-uri` and
 `--path-to-tensors` are functionally the same in this case.
+
+Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter
+can be serialized directly with the path to the LoRA adapter on HF Hub and
+a TensorizerConfig object. In this script, passing a HF id to a LoRA adapter
+will serialize the LoRA adapter artifacts to `--serialized-directory`.
+
+You can then use the LoRA adapter with `vllm serve`, for instance, by ensuring 
+the LoRA artifacts are in your model artifacts directory and specifying 
+`--enable-lora`. For instance:
+
+```
+vllm serve <model_path> \
+    --load-format tensorizer \
+    --model-loader-extra-config '{"tensorizer_uri": "<model_path>.tensors"}' \
+    --enable-lora
+```
 """


@@ -107,6 +127,19 @@ def parse_args():
        "also supported, although libsodium must be installed to "
        "use it.")
    parser = EngineArgs.add_cli_args(parser)
+
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        required=False,
+        help="Path to a LoRA adapter to "
+        "serialize along with model tensors. This can then be deserialized "
+        "along with the model by passing a tensorizer_config kwarg to "
+        "LoRARequest with type TensorizerConfig. See the docstring for this "
+        "for a usage example."
+
+    )
+
    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
@@ -169,6 +202,37 @@ def parse_args():


 def deserialize():
+    if args.lora_path:
+        tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
+        llm = LLM(model=args.model,
+                  load_format="tensorizer",
+                  tensor_parallel_size=args.tensor_parallel_size,
+                  model_loader_extra_config=tensorizer_config,
+                  enable_lora=True,
+        )
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=256,
+            stop=["[/assistant]"]
+        )
+
+        # Truncating this as the extra text isn't necessary
+        prompts = [
+            "[user] Write a SQL query to answer the question based on ..."
+        ]
+
+        # Test LoRA load
+        print(
+            llm.generate(
+            prompts,
+            sampling_params,
+            lora_request=LoRARequest("sql-lora",
+                                     1,
+                                     args.lora_path,
+                                     tensorizer_config = tensorizer_config)
+            )
+        )
+    else:
        llm = LLM(model=args.model,
                  load_format="tensorizer",
                  tensor_parallel_size=args.tensor_parallel_size,
@@ -197,7 +261,10 @@ if __name__ == '__main__':

    model_name = model_ref.split("/")[1]

-    keyfile = args.keyfile if args.keyfile else None
+    if args.command == "serialize" or args.command == "deserialize":
+        keyfile = args.keyfile
+    else:
+        keyfile = None

    if args.model_loader_extra_config:
        config = json.loads(args.model_loader_extra_config)
@@ -228,6 +295,10 @@ if __name__ == '__main__':
            encryption_keyfile=keyfile,
            **credentials)

+        if args.lora_path:
+            tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
+            tensorize_lora_adapter(args.lora_path, tensorizer_config)
+
        tensorize_vllm_model(engine_args, tensorizer_config)

    elif args.command == "deserialize":

--- a/examples/pyproject.toml
+++ b/examples/pyproject.toml
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+
+[tool.ruff]
+line-length = 88
+exclude = [
+    # External file, leaving license intact
+    "examples/other/fp8/quantizer/quantize.py",
+    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
+]
+
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["vllm"]
+
+[tool.ruff.format]
+docstring-code-format = true
\ No newline at end of file
--- a/examples/tool_chat_template_llama4_pythonic.jinja
+++ b/examples/tool_chat_template_llama4_pythonic.jinja
 {{- bos_token }}
-{%- if custom_tools is defined %}
+{%- if custom_tools is defined and custom_tools%}
    {%- set tools = custom_tools %}
 {%- endif %}
-{%- if not tools_in_user_message is defined %}
-    {%- set tools_in_user_message = false %}
-{%- endif %}
-{%- if not tools is defined %}
+{%- if tools is defined and tools %}
+    {%- set tool_definition = tool_definition ~ (tools | tojson(indent=4)) %}
+{%- else %}
    {%- set tools = none %}
 {%- endif %}

+
 {#- This block extracts the system message, so we can slot it into the right place. #}
 {%- if messages[0]['role'] == 'system' %}
+    {%- set user_provided_system_message = true %}
    {%- if messages[0]['content'] is string %}
        {%- set system_message = messages[0]['content']|trim %}
    {%- else %}
@@ -19,66 +20,31 @@
    {%- set messages = messages[1:] %}
 {%- else %}
    {%- if tools is not none  %}
-        {#- Add default tool system message when tools are provided #}
-        {%- set system_message = "You are a helpful assistant with tool calling "
-            "capabilities. Only reply with a tool call if the function exists in the "
-            "library provided by the user. If it doesn't exist, just reply directly in "
-            "natural language. When you receive a tool call response, use the output to "
-            "format an answer to the original user question." %}
+        {#- Since no system_message was provided by the user, if tools are provided, system_message defaults to the tool system message #}
+        {#- This system message is from the Llama website: https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/ #}
+        {%- set system_message = "You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:\n\n1. FUNCTION CALLS:\n- ONLY use functions that are EXPLICITLY listed in the function list below\n- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If a function is not in the list, respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)\n- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\nExamples:\nCORRECT: [get_weather(location=\"Vancouver\"), calculate_route(start=\"Boston\", end=\"New York\")] <- Only if get_weather and calculate_route are in function list\nINCORRECT: get_weather(location=\"New York\")\nINCORRECT: Let me check the weather: [get_weather(location=\"New York\")]\nINCORRECT: [get_events(location=\"Singapore\")] <- If function not in list\n\n2. RESPONSE RULES:\n- For pure function requests matching a listed function: ONLY output the function call(s)\n- For knowledge questions: ONLY output text\n- For missing parameters: ONLY request the specific missing parameters\n- For unavailable services (not in function list): output ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\". Do NOT execute a function call.\n- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations\n- NEVER combine text and function calls in the same response\n- NEVER suggest alternative functions when the requested service is unavailable\n- NEVER create or invent new functions not listed below\n\n3. 
STRICT BOUNDARIES:\n- ONLY use functions from the list below - no exceptions\n- NEVER use a function as an alternative to unavailable information\n- NEVER call functions not present in the function list\n- NEVER add explanatory text to function calls\n- NEVER respond with empty brackets\n- Use proper Python/JSON syntax for function calls\n- Check the function list carefully before responding\n\n4. TOOL RESPONSE HANDLING:\n- When receiving tool responses: provide concise, natural language responses\n- Don't repeat tool response verbatim\n- Don't add supplementary information\n\nHere is a list of functions in JSON format that you can invoke:\n" %}
    {%- else %}
        {%- set system_message = "" %}
    {%- endif %}
 {%- endif %}
-
-{#- System message if the user supplied one, or if tools are used (default tool system message) #}
+{#- Now write the system message: use the user-provided system message if user_provided_system_message is set, else the default tool system message if tools are present #}
 {%- if system_message %}
    {#- always use user provided system message to override default tool system message #}
    {{- "<|header_start|>system<|header_end|>\n\n" }}
    {{- system_message }}
-    {%- if tools is not none and not tools_in_user_message %}
-        {{- "Tools: You have access to the following tools. You might need to use one "
-            "or more function/tool calls to fulfill the task. \n"
-            "If none are needed, then proceed to the response.\n\n"
-            "Tool Call Syntax: You can call tools using the following syntax:\n"
-            "[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n"
-            "Do not include anything else when calling the tools with the syntax above.\n\n"
-            "Here is a list of functions in JSON format that you can invoke.\n " }}
-        {%- for t in tools %}
-            {{- t | tojson(indent=4) }}
-            {{- "\n\n" }}
-        {%- endfor %}
+    {%- if user_provided_system_message and tools %}
+        {{- "\nHere is a list of functions in JSON format that you can invoke. Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\n" }}
+        {{- tool_definition -}}
+        {%- elif tool_definition %}
+        {{- tool_definition -}}
    {%- endif %}
    {{- "<|eot|>" }}
 {%- endif %}

-{#- Custom tools are passed in a user message with some extra guidance #}
-{%- if tools_in_user_message and tools is not none %}
-    {#- Extract the first user message so we can plug it in here #}
-    {%- if messages | length != 0 %}
-        {%- if messages[0]['content'] is string %}
-            {%- set first_user_message = messages[0]['content']|trim %}
-        {%- else %}
-            {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %}
-        {%- endif %}
-        {%- set messages = messages[1:] %}
-    {%- else %}
-        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
-    {%- endif %}
-    {{- '<|header_start|>user<|header_end|>\n\n' -}}
-    {{- first_user_message}}
-    {{- "\nHere is a list of functions in JSON format that you can invoke:"}}
-    {%- for t in tools %}
-        {{- t | tojson(indent=4) }}
-        {{- "\n\n" }}
-    {%- endfor %}
-    {{- "Should you decide to return the function call(s), put them in the format "
-        "of [func_name1(params_name1=params_value1, params_name2=params_value2, "
-        "...), ...]\nDo not include anything else when calling the tools with the "
-        "syntax above." }}
-{%- endif %}
-
+{#- Now deal with all other messages #}
 {%- for message in messages %}
-    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+    {#- Base case: messages that are not from the tool role and have an empty tool_calls list #}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or ('tool_calls' in message and  message.tool_calls|length != 0 )) %}
        {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
        {%- if message['content'] is string %}
            {{- message['content'] }}
@@ -92,8 +58,10 @@
            {%- endfor %}
        {%- endif %}
    {{- "<|eot|>" }}
-    {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
-        {%- set tool_call = message.tool_calls[0].function %}
+    {#- Tool case: the message has a non-empty tool_calls list and must come from the assistant #}
+    {%- elif 'tool_calls' in message %}
+        {#- assume tool_calls are always coming from assistant #}
+        {%- if message.role == 'assistant' %}
            {{- '<|header_start|>assistant<|header_end|>\n\n' -}}
        {%- if message['content'] is string %}
            {{- message['content'] }}
@@ -106,20 +74,24 @@
                {%- endif %}
            {%- endfor %}
        {%- endif %}
+            {{- "[" }}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
                {{-  tool_call.name + '(' -}}
            {%- for param in tool_call.arguments %}
-                {{- param + '=' -}}
+                {{- param + '="' -}}
                {{- "%s" | format(tool_call.arguments[param]) -}}
+                {{- '"' -}}
                {% if not loop.last %}, {% endif %}
            {%- endfor %}
            {{- ')' -}}
            {% if not loop.last %}, {% endif %}
        {%- endfor %}
-        {{- "<|eom|>" }}
+        {{- "]<|eot|>" }}
+{%- endif %}
+{#- Tool response case: messages come from the tool (or ipython) role #}
    {%- elif message.role == "tool" or message.role == "ipython" %}
        {{- "<|header_start|>ipython<|header_end|>\n\n" }}
        {%- if message.content is string %}
@@ -131,7 +103,7 @@
                {%- endif %}
            {%- endfor %}
        {%- endif %}
-        {{- "<|eom|>" }}
+        {{- "<|eot|>" }}
    {%- endif %}
 {%- endfor %}
 {%- if add_generation_prompt %}

--- a/mkdocs.yaml
+++ b/mkdocs.yaml
+site_name: vLLM
+site_url: https://docs.vllm.ai
+repo_url: https://github.com/vllm-project/vllm
+exclude_docs: |
+  *.inc.md
+  *.template.md
+theme:
+  name: material
+  logo: assets/logos/vllm-logo-only-light.ico
+  favicon: assets/logos/vllm-logo-only-light.ico
+  palette:
+    # Palette toggle for automatic mode
+    - media: "(prefers-color-scheme)"
+      toggle:
+        icon: material/brightness-auto
+        name: Switch to light mode
+    # Palette toggle for light mode
+    - media: "(prefers-color-scheme: light)"
+      scheme: default 
+      primary: white
+      toggle:
+        icon: material/brightness-7
+        name: Switch to dark mode
+    # Palette toggle for dark mode
+    - media: "(prefers-color-scheme: dark)"
+      scheme: slate
+      primary: black
+      toggle:
+        icon: material/brightness-2
+        name: Switch to system preference
+  features:
+    - content.code.copy
+    - content.tabs.link
+    - navigation.tracking
+    - navigation.tabs
+    - navigation.sections
+    - navigation.prune
+    - navigation.top
+    - search.highlight
+    - search.share
+    - toc.follow
+  custom_dir: docs/mkdocs/overrides
+
+hooks:
+  - docs/mkdocs/hooks/remove_announcement.py
+  - docs/mkdocs/hooks/generate_examples.py
+  - docs/mkdocs/hooks/url_schemes.py
+
+# Required to stop api-autonav from raising an error
+# https://github.com/tlambert03/mkdocs-api-autonav/issues/16
+nav:
+  - api
+
+plugins:
+  - meta
+  - search
+  - autorefs
+  - awesome-nav
+  # For API reference generation
+  - api-autonav:
+      modules: ["vllm"]
+      api_root_uri: "api"
+      exclude:
+        - "re:vllm\\._.*"  # Internal modules
+        - "vllm.third_party"
+        - "vllm.vllm_flash_attn"
+  - mkdocstrings:
+      handlers:
+        python:
+          options:
+            show_symbol_type_heading: true
+            show_symbol_type_toc: true
+            filters: []
+            summary:
+              modules: true
+            show_if_no_docstring: true
+            show_signature_annotations: true
+            separate_signature: true
+            show_overloads: true
+            signature_crossrefs: true
+          inventories:
+          - https://docs.python.org/3/objects.inv
+          - https://typing-extensions.readthedocs.io/en/latest/objects.inv
+          - https://docs.aiohttp.org/en/stable/objects.inv
+          - https://pillow.readthedocs.io/en/stable/objects.inv
+          - https://numpy.org/doc/stable/objects.inv
+          - https://pytorch.org/docs/stable/objects.inv
+          - https://psutil.readthedocs.io/en/stable/objects.inv
+
+markdown_extensions:
+  - attr_list
+  - md_in_html
+  - admonition
+  - pymdownx.details
+  # For content tabs
+  - pymdownx.superfences
+  - pymdownx.tabbed:
+      slugify: !!python/object/apply:pymdownx.slugs.slugify
+        kwds:
+          case: lower
+      alternate_style: true
+  # For code highlighting
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  # For emoji and icons
+  - pymdownx.emoji:
+      emoji_index: !!python/name:material.extensions.emoji.twemoji
+      emoji_generator: !!python/name:material.extensions.emoji.to_svg
+  # For in page [TOC] (not sidebar)
+  - toc:
+      permalink: true
+  # For math rendering
+  - mdx_math:
+      enable_dollar_delimiter: true
+
+extra_css:
+  - mkdocs/stylesheets/extra.css
+
+extra_javascript:
+  - mkdocs/javascript/run_llm_widget.js
+  - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
+
+# Makes the url format end in .html rather than act as a dir
+# So index.md generates as index.html and is available under URL /index.html
+# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
+use_directory_urls: false
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,6 +8,7 @@ requires = [
    "setuptools-scm>=8.0",
    "torch == 2.7.0",
    "wheel",
+    "regex",
    "jinja2",
 ]
 build-backend = "setuptools.build_meta"
@@ -35,8 +36,8 @@ dynamic = [ "version", "dependencies", "optional-dependencies"]

 [project.urls]
 Homepage="https://github.com/vllm-project/vllm"
-Documentation="https://vllm.readthedocs.io/en/latest/"
-Slack="http://slack.vllm.ai/"
+Documentation="https://docs.vllm.ai/en/latest/"
+Slack="https://slack.vllm.ai/"

 [project.scripts]
 vllm = "vllm.entrypoints.cli.main:main"
@@ -56,16 +57,12 @@ ignore_patterns = [
    ".buildkite/**",
    "benchmarks/**",
    "build/**",
+    "examples/**",
 ]

 [tool.ruff]
 # Allow lines to be as long as 80.
 line-length = 80
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]

 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
@@ -148,6 +145,7 @@ skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora
 skip_glob = [
    ".buildkite/*",
    "benchmarks/*",
+    "examples/*",
 ]
 use_parentheses = true
 skip_gitignore = true
@@ -165,9 +163,12 @@ markers = [

 [tool.pymarkdown]
 plugins.md004.style = "sublist" # ul-style
+plugins.md007.indent = 4 # ul-indent
+plugins.md007.start_indented = true # ul-indent
 plugins.md013.enabled = false # line-length
 plugins.md041.enabled = false # first-line-h1
 plugins.md033.enabled = false # inline-html
+plugins.md046.enabled = false # code-block-style
 plugins.md024.allow_different_nesting = true # no-duplicate-headers

 [tool.ty]

--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -7,3 +7,4 @@ setuptools-scm>=8
 torch==2.7.0
 wheel
 jinja2>=3.1.6
+regex
--- a/requirements/common.txt
+++ b/requirements/common.txt
+regex # Replace re for higher-performance regex matching
 cachetools
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
@@ -7,7 +8,7 @@ tqdm
 blake3
 py-cpuinfo
 transformers >= 4.51.1
-huggingface-hub[hf_xet] >= 0.30.0  # Required for Xet downloads.
+huggingface-hub[hf_xet] >= 0.32.0  # Required for Xet downloads.
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
@@ -40,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
-python-json-logger # Used by logging as per examples/other/logging_configuration.md
+python-json-logger # Used by logging as per examples/others/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 opentelemetry-sdk>=1.26.0  # vllm.tracing

--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -2,11 +2,12 @@
 -r common.txt

 # Dependencies for CPUs
+packaging>=24.2
+setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.7.0+cpu; platform_machine == "x86_64"
 torch==2.7.0; platform_system == "Darwin"
 torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
-torch==2.7.0.dev20250304; platform_machine == "s390x"

 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
@@ -19,3 +20,7 @@ datasets # for benchmark scripts

 # cpu cannot use triton 3.3.0
 triton==3.2.0; platform_machine == "x86_64"
+
+# Intel Extension for PyTorch, only for x86_64 CPUs
+intel-openmp==2024.2.1; platform_machine == "x86_64"
+intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
-sphinx==7.4.7
-sphinx-argparse==0.5.2
-sphinx-book-theme==1.1.4
-sphinx-copybutton==0.5.2
-sphinx-design==0.6.1
-sphinx-togglebutton==0.3.2
-myst-parser==3.0.1  # `myst-parser==4.0.1` breaks inline code in titles
-msgspec
-snowballstemmer<3  # https://github.com/snowballstem/snowball/issues/229
-commonmark # Required by sphinx-argparse when using :markdownhelp:
-
-# Custom autodoc2 is necessary for faster docstring processing
-# see: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33#issuecomment-2856386035
-git+https://github.com/hmellor/sphinx-autodoc2.git # sphinx-autodoc2==0.5.0
-
-# packages to install to build the documentation
-cachetools
-f https://download.pytorch.org/whl/cpu
-torch
\ No newline at end of file
+mkdocs
+mkdocs-api-autonav
+mkdocs-material
+mkdocstrings-python
+mkdocs-gen-files
+mkdocs-awesome-nav
+python-markdown-math
+regex
+ruff
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt