Merge tag 'v0.8.4' into v0.8.4-dev

31330101 · zhuwenwen · e8933c34 · dc1b4a6f · 31330101 · 31330101
Commit 31330101 authored Apr 16, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
@@ -22,31 +22,40 @@ prompts = [
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-# Create an LLM.
-llm = LLM(
-    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    max_num_seqs=8,
-    # The max_model_len and block_size arguments are required to be same as
-    # max sequence length when targeting neuron device.
-    # Currently, this is a known limitation in continuous batching support
-    # in transformers-neuronx.
-    # TODO(liangfu): Support paged-attention in transformers-neuronx.
-    max_model_len=2048,
-    block_size=2048,
-    # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection,
-    # or explicitly assigned.
-    device="neuron",
-    quantization="neuron_quant",
-    override_neuron_config={
-        "cast_logits_dtype": "bfloat16",
-    },
-    tensor_parallel_size=2)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    # Create an LLM.
+    llm = LLM(
+        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        max_num_seqs=8,
+        # The max_model_len and block_size arguments must be the same as the
+        # max sequence length when targeting a neuron device.
+        # Currently, this is a known limitation in continuous batching support
+        # in transformers-neuronx.
+        # TODO(liangfu): Support paged-attention in transformers-neuronx.
+        max_model_len=2048,
+        block_size=2048,
+        # ruff: noqa: E501
+        # The device can be automatically detected when AWS Neuron SDK is installed.
+        # The device argument can be either unspecified for automated detection,
+        # or explicitly assigned.
+        device="neuron",
+        quantization="neuron_quant",
+        override_neuron_config={
+            "cast_logits_dtype": "bfloat16",
+        },
+        tensor_parallel_size=2)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/prefix_caching.py
+++ b/examples/offline_inference/prefix_caching.py
@@ -31,55 +31,62 @@ generating_prompts = [prefix + prompt for prompt in prompts]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.0)

-# Create an LLM without prefix caching as a baseline.
-regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
-
-print("Results without `enable_prefix_caching`")
-
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = regular_llm.generate(generating_prompts, sampling_params)
-
-regular_generated_texts = []
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    regular_generated_texts.append(generated_text)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-print("-" * 80)
-
-# Destroy the LLM object and free up the GPU memory.
-del regular_llm
-cleanup_dist_env_and_memory()
-
-# Create an LLM with prefix caching enabled.
-prefix_cached_llm = LLM(model="facebook/opt-125m",
-                        enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
-
-# Warmup so that the shared prompt's KV cache is computed.
-prefix_cached_llm.generate(generating_prompts[0], sampling_params)
-
-# Generate with prefix caching.
-outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
-
-print("Results with `enable_prefix_caching`")
-
-cached_generated_texts = []
-# Print the outputs. You should see the same outputs as before.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    cached_generated_texts.append(generated_text)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-print("-" * 80)
-
-# Compare the results and display the speedup
-generated_same = all([
-    regular_generated_texts[i] == cached_generated_texts[i]
-    for i in range(len(prompts))
-])
-print(f"Generated answers are the same: {generated_same}")
+
+def main():
+    # Create an LLM without prefix caching as a baseline.
+    regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
+
+    print("Results without `enable_prefix_caching`")
+
+    # ruff: noqa: E501
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = regular_llm.generate(generating_prompts, sampling_params)
+
+    regular_generated_texts = []
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        regular_generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+    # Destroy the LLM object and free up the GPU memory.
+    del regular_llm
+    cleanup_dist_env_and_memory()
+
+    # Create an LLM with prefix caching enabled.
+    prefix_cached_llm = LLM(model="facebook/opt-125m",
+                            enable_prefix_caching=True,
+                            gpu_memory_utilization=0.4)
+
+    # Warmup so that the shared prompt's KV cache is computed.
+    prefix_cached_llm.generate(generating_prompts[0], sampling_params)
+
+    # Generate with prefix caching.
+    outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
+
+    print("Results with `enable_prefix_caching`")
+
+    cached_generated_texts = []
+    # Print the outputs. You should see the same outputs as before.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        cached_generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+    # Compare the outputs of the baseline and prefix-cached runs.
+    generated_same = all([
+        regular_generated_texts[i] == cached_generated_texts[i]
+        for i in range(len(prompts))
+    ])
+    print(f"Generated answers are the same: {generated_same}")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -234,9 +234,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
            sampling_params.max_tokens = next(output_len_generator)
            assert isinstance(sampling_params.max_tokens, int)

-            prompt_token_ids = torch.randint(
-                llm.llm_engine.model_config.get_vocab_size(),
-                size=(prompt_len, )).tolist()
+            prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size,
+                                             size=(prompt_len, )).tolist()

            llm.llm_engine.add_request(
                request_id=f"seq{i}",

--- a/examples/offline_inference/reproduciblity.py
+++ b/examples/offline_inference/reproduciblity.py
@@ -19,8 +19,6 @@ SEED = 42
 # because it is almost impossible to make the scheduling deterministic in the
 # online serving setting.

-llm = LLM(model="facebook/opt-125m", seed=SEED)
-
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
@@ -29,8 +27,17 @@ prompts = [
 ]
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def main():
+    llm = LLM(model="facebook/opt-125m", seed=SEED)
+    outputs = llm.generate(prompts, sampling_params)
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -85,11 +85,13 @@ sampling_params = SamplingParams(temperature=0)

 outputs = ray.get(llm.generate.remote(prompts, sampling_params))

+print("-" * 50)
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)

 # set up the communication between the training process
 # and the inference engine.
@@ -120,8 +122,10 @@ assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
 # use the updated model to generate texts, they will be nonsense
 # because the weights are all zeros.
 outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
+print("-" * 50)
 for output in outputs_updated:
    prompt = output.prompt
    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)
--- a/examples/offline_inference/simple_profiling.py
+++ b/examples/offline_inference/simple_profiling.py
@@ -32,10 +32,12 @@ if __name__ == "__main__":
    llm.stop_profile()

    # Print the outputs.
+    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)

    # Add a buffer to wait for profiler in the background process
    # (in case MP is on) to finish writing profiling output.

--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
 # SPDX-License-Identifier: Apache-2.0
+"""
+This file demonstrates how to use guided decoding
+to generate structured outputs using vLLM. It shows how to apply
+different guided decoding techniques such as Choice, Regex, JSON schema,
+and Grammar to produce structured and formatted results
+based on specific prompts.
+"""

 from enum import Enum

@@ -7,26 +14,21 @@ from pydantic import BaseModel
 from vllm import LLM, SamplingParams
 from vllm.sampling_params import GuidedDecodingParams

-llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
-
 # Guided decoding by Choice (list of possible options)
-guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
-sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
-outputs = llm.generate(
-    prompts="Classify this sentiment: vLLM is wonderful!",
-    sampling_params=sampling_params,
-)
-print(outputs[0].outputs[0].text)
+guided_decoding_params_choice = GuidedDecodingParams(
+    choice=["Positive", "Negative"])
+sampling_params_choice = SamplingParams(
+    guided_decoding=guided_decoding_params_choice)
+prompt_choice = "Classify this sentiment: vLLM is wonderful!"

 # Guided decoding by Regex
-guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n")
-sampling_params = SamplingParams(guided_decoding=guided_decoding_params,
-                                 stop=["\n"])
-prompt = ("Generate an email address for Alan Turing, who works in Enigma."
-          "End in .com and new line. Example result:"
-          "alan.turing@enigma.com\n")
-outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
-print(outputs[0].outputs[0].text)
+guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
+sampling_params_regex = SamplingParams(
+    guided_decoding=guided_decoding_params_regex, stop=["\n"])
+prompt_regex = (
+    "Generate an email address for Alan Turing, who works in Enigma. "
+    "End in .com and new line. Example result: "
+    "alan.turing@enigma.com\n")


 # Guided decoding by JSON using Pydantic schema
@@ -44,37 +46,54 @@ class CarDescription(BaseModel):


 json_schema = CarDescription.model_json_schema()
-
-guided_decoding_params = GuidedDecodingParams(json=json_schema)
-sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
-prompt = ("Generate a JSON with the brand, model and car_type of"
-          "the most iconic car from the 90's")
-outputs = llm.generate(
-    prompts=prompt,
-    sampling_params=sampling_params,
-)
-print(outputs[0].outputs[0].text)
+guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
+sampling_params_json = SamplingParams(
+    guided_decoding=guided_decoding_params_json)
+prompt_json = ("Generate a JSON with the brand, model and car_type of "
+               "the most iconic car from the 90's")

 # Guided decoding by Grammar
 simplified_sql_grammar = """
-    ?start: select_statement
+root ::= select_statement
+select_statement ::= "SELECT " column " from " table " where " condition
+column ::= "col_1 " | "col_2 "
+table ::= "table_1 " | "table_2 "
+condition ::= column "= " number
+number ::= "1 " | "2 "
+"""
+guided_decoding_params_grammar = GuidedDecodingParams(
+    grammar=simplified_sql_grammar)
+sampling_params_grammar = SamplingParams(
+    guided_decoding=guided_decoding_params_grammar)
+prompt_grammar = ("Generate an SQL query to show the 'username' and 'email' "
+                  "from the 'users' table.")

-    ?select_statement: "SELECT " column_list " FROM " table_name

-    ?column_list: column_name ("," column_name)*
+def format_output(title: str, output: str):
+    print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")

-    ?table_name: identifier

-    ?column_name: identifier
+def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
+    outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
+    return outputs[0].outputs[0].text

-    ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
-"""
-guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar)
-sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
-prompt = ("Generate an SQL query to show the 'username' and 'email'"
-          "from the 'users' table.")
-outputs = llm.generate(
-    prompts=prompt,
-    sampling_params=sampling_params,
-)
-print(outputs[0].outputs[0].text)
+
+def main():
+    llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
+
+    choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
+    format_output("Guided decoding by Choice", choice_output)
+
+    regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
+    format_output("Guided decoding by Regex", regex_output)
+
+    json_output = generate_output(prompt_json, sampling_params_json, llm)
+    format_output("Guided decoding by JSON", json_output)
+
+    grammar_output = generate_output(prompt_grammar, sampling_params_grammar,
+                                     llm)
+    format_output("Guided decoding by Grammar", grammar_output)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -36,11 +36,13 @@ llm = LLM(
 outputs = llm.generate(prompts, sampling_params)

 # all ranks will have the same outputs
+print("-" * 50)
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, "
+    print(f"Prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)
 """
 Further tips:


--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -16,14 +16,22 @@ N = 1
 # Currently, top-p sampling is disabled. `top_p` should be 1.0.
 sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)

-# Set `enforce_eager=True` to avoid ahead-of-time compilation.
-# In real workloads, `enforace_eager` should be `False`.
-llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-          max_num_batched_tokens=64,
-          max_num_seqs=4)
-outputs = llm.generate(prompts, sampling_params)
-for output, answer in zip(outputs, answers):
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    assert generated_text.startswith(answer)
+
+def main():
+    # Set `enforce_eager=True` to avoid ahead-of-time compilation.
+    # In real workloads, `enforce_eager` should be `False`.
+    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
+              max_num_batched_tokens=64,
+              max_num_seqs=4)
+    outputs = llm.generate(prompts, sampling_params)
+    print("-" * 50)
+    for output, answer in zip(outputs, answers):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        assert generated_text.startswith(answer)
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -8,6 +8,7 @@ on HuggingFace model repository.
 """
 import os
 import random
+from contextlib import contextmanager
 from dataclasses import asdict
 from typing import NamedTuple, Optional

@@ -44,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -70,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )
    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -91,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-6.7b",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -109,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -128,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = [
@@ -154,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -174,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -193,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = [("<bos><start_of_turn>user\n"
@@ -218,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = [
@@ -245,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -286,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
                "longest_edge": 3 * 364
            },
        },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )
    prompts = [(
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -298,6 +299,34 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    )


+# SmolVLM2-2.2B-Instruct
+def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        enforce_eager=True,
+        mm_processor_kwargs={
+            "max_image_size": {
+                "longest_edge": 384
+            },
+        },
+        limit_mm_per_prompt={"image": 1},
+    )
+    prompts = [
+        (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -308,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -346,7 +375,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -363,7 +392,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    engine_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -385,7 +414,7 @@ def run_llava_next_video(questions: list[str],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -413,7 +442,7 @@ def run_llava_onevision(questions: list[str],
    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -436,7 +465,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )
    stop_token_ids = [128009]

@@ -477,7 +506,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )
    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
@@ -532,7 +561,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -556,9 +585,9 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
-        max_model_len=4096,
+        max_model_len=8192,
        max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -582,7 +611,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    )


-def run_llama4(questions: list[str], modality: str):
+def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -592,8 +621,8 @@ def run_llama4(questions: list[str], modality: str):
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        gpu_memory_utilization=0.4,
+        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -628,7 +657,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = [
@@ -654,7 +683,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -681,7 +710,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+        limit_mm_per_prompt={"image": 1},
+    )

    return ModelRequestData(
        engine_args=engine_args,
@@ -697,7 +727,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+        limit_mm_per_prompt={"image": 1},
+    )

    return ModelRequestData(
        engine_args=engine_args,
@@ -733,7 +764,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"num_crops": 16},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -764,6 +795,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -784,7 +816,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        max_model_len=6144,
        max_num_seqs=2,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -805,7 +837,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -830,7 +862,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    if modality == "image":
@@ -865,7 +897,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    if modality == "image":
@@ -896,7 +928,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
-        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        limit_mm_per_prompt={"image": 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -955,6 +987,7 @@ model_example_map = {
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "skywork_chat": run_skyworkr1v,
+    "smolvlm": run_smolvlm,
 }


@@ -1026,6 +1059,20 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data,
    return inputs


+@contextmanager
+def time_counter(enable: bool):
+    if enable:
+        import time
+        start_time = time.time()
+        yield
+        elapsed_time = time.time() - start_time
+        print("-" * 50)
+        print("-- generate time = {}".format(elapsed_time))
+        print("-" * 50)
+    else:
+        yield
+
+
 def main(args):
    model = args.model_type
    if model not in model_example_map:
@@ -1038,15 +1085,16 @@ def main(args):

    req_data = model_example_map[model](questions, modality)

-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
-    llm = LLM(**engine_args)
+    # Disable other modalities to save memory
+    default_limits = {"image": 0, "video": 0, "audio": 0}
+    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
+        req_data.engine_args.limit_mm_per_prompt or {})

-    # To maintain code compatibility in this script, we add LoRA here.
-    # You can also add LoRA using:
-    # llm.generate(prompts, lora_request=lora_request,...)
-    if req_data.lora_requests:
-        for lora_request in req_data.lora_requests:
-            llm.llm_engine.add_lora(lora_request=lora_request)
+    engine_args = asdict(req_data.engine_args) | {
+        "seed": args.seed,
+        "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
+    }
+    llm = LLM(**engine_args)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = req_data.prompts if args.use_different_prompt_per_request else [
@@ -1084,19 +1132,22 @@ def main(args):
                },
            } for i in range(args.num_prompts)]

-    if args.time_generate:
-        import time
-        start_time = time.time()
-        outputs = llm.generate(inputs, sampling_params=sampling_params)
-        elapsed_time = time.time() - start_time
-        print("-- generate time = {}".format(elapsed_time))
+    # Add LoRA request if applicable
+    lora_request = (req_data.lora_requests *
+                    args.num_prompts if req_data.lora_requests else None)

-    else:
-        outputs = llm.generate(inputs, sampling_params=sampling_params)
+    with time_counter(args.time_generate):
+        outputs = llm.generate(
+            inputs,
+            sampling_params=sampling_params,
+            lora_request=lora_request,
+        )

+    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
+        print("-" * 50)


 if __name__ == "__main__":

--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -63,6 +63,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
        model="royokong/e5-v",
        task="embed",
        max_model_len=4096,
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -93,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
        task="embed",
        trust_remote_code=True,
        mm_processor_kwargs={"num_crops": 4},
+        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
@@ -131,6 +133,11 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
    query = get_query(modality)
    req_data = model_example_map[model](query)

+    # Disable other modalities to save memory
+    default_limits = {"image": 0, "video": 0, "audio": 0}
+    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
+        req_data.engine_args.limit_mm_per_prompt or {})
+
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

@@ -143,8 +150,10 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
        "multi_modal_data": mm_data,
    })

+    print("-" * 50)
    for output in outputs:
        print(output.outputs.embedding)
+        print("-" * 50)


 def main(args: Namespace):

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -22,6 +22,16 @@ QUESTION = "What is the content of each image?"
 IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
 ]


@@ -217,6 +227,33 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+
+    # The configuration below has been confirmed to launch on a single L40 GPU.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=16,
+        enforce_eager=True,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={
+            "max_image_size": {
+                "longest_edge": 384
+            },
+        },
+    )
+
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL2-2B"

@@ -258,8 +295,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:

    engine_args = EngineArgs(
        model=model_name,
-        max_model_len=8192,
-        max_num_seqs=4,
+        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )
@@ -318,8 +354,8 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
-        max_model_len=4096,
-        max_num_seqs=16,
+        max_model_len=8192,
+        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

@@ -614,6 +650,7 @@ model_example_map = {
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
+    "smolvlm": load_smolvlm,
 }


@@ -624,15 +661,8 @@ def run_generate(model, question: str, image_urls: list[str],
    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

-    # To maintain code compatibility in this script, we add LoRA here.
-    # You can also add LoRA using:
-    # llm.generate(prompts, lora_request=lora_request,...)
-    if req_data.lora_requests:
-        for lora_request in req_data.lora_requests:
-            llm.llm_engine.add_lora(lora_request=lora_request)
-
    sampling_params = SamplingParams(temperature=0.0,
-                                     max_tokens=128,
+                                     max_tokens=256,
                                     stop_token_ids=req_data.stop_token_ids)

    outputs = llm.generate(
@@ -642,29 +672,31 @@ def run_generate(model, question: str, image_urls: list[str],
                "image": req_data.image_data
            },
        },
-        sampling_params=sampling_params)
+        sampling_params=sampling_params,
+        lora_request=req_data.lora_requests,
+    )

+    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
+        print("-" * 50)


 def run_chat(model: str, question: str, image_urls: list[str],
             seed: Optional[int]):
    req_data = model_example_map[model](question, image_urls)

+    # Disable other modalities to save memory
+    default_limits = {"image": 0, "video": 0, "audio": 0}
+    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
+        req_data.engine_args.limit_mm_per_prompt or {})
+
    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    llm = LLM(**engine_args)

-    # To maintain code compatibility in this script, we add LoRA here.
-    # You can also add LoRA using:
-    # llm.generate(prompts, lora_request=lora_request,...)
-    if req_data.lora_requests:
-        for lora_request in req_data.lora_requests:
-            llm.llm_engine.add_lora(lora_request=lora_request)
-
    sampling_params = SamplingParams(temperature=0.0,
-                                     max_tokens=128,
+                                     max_tokens=256,
                                     stop_token_ids=req_data.stop_token_ids)
    outputs = llm.chat(
        [{
@@ -685,11 +717,14 @@ def run_chat(model: str, question: str, image_urls: list[str],
        }],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
+        lora_request=req_data.lora_requests,
    )

+    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
+        print("-" * 50)


 def main(args: Namespace):
@@ -697,10 +732,12 @@ def main(args: Namespace):
    method = args.method
    seed = args.seed

+    image_urls = IMAGE_URLS[:args.num_images]
+
    if method == "generate":
-        run_generate(model, QUESTION, IMAGE_URLS, seed)
+        run_generate(model, QUESTION, image_urls, seed)
    elif method == "chat":
-        run_chat(model, QUESTION, IMAGE_URLS, seed)
+        run_chat(model, QUESTION, image_urls, seed)
    else:
        raise ValueError(f"Invalid method: {method}")

@@ -725,6 +762,12 @@ if __name__ == "__main__":
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")
+    parser.add_argument(
+        "--num-images", "-n",
+        type=int,  # convert first: choices are ints and main() slices with it
+        choices=list(range(1, len(IMAGE_URLS) + 1)),  # one per bundled URL
+        default=2,
+        help="Number of images to use for the demo.")

    args = parser.parse_args()
    main(args)
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
 # SPDX-License-Identifier: Apache-2.0
 """Example Python client for `vllm.entrypoints.api_server`
+Start the demo server:
+    python -m vllm.entrypoints.api_server --model <model_name>
+
 NOTE: The API server is used only for demonstration and simple performance
 benchmarks. It is not intended for production use.
 For production use, we recommend `vllm serve` and the OpenAI client API.
@@ -7,6 +10,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.

 import argparse
 import json
+from argparse import Namespace
 from collections.abc import Iterable

 import requests
@@ -27,7 +31,6 @@ def post_http_request(prompt: str,
    pload = {
        "prompt": prompt,
        "n": n,
-        "use_beam_search": True,
        "temperature": 0.0,
        "max_tokens": 16,
        "stream": stream,
@@ -55,14 +58,7 @@ def get_response(response: requests.Response) -> list[str]:
    return output


-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--n", type=int, default=4)
-    parser.add_argument("--prompt", type=str, default="San Francisco is a")
-    parser.add_argument("--stream", action="store_true")
-    args = parser.parse_args()
+def main(args: Namespace):
    prompt = args.prompt
    api_url = f"http://{args.host}:{args.port}/generate"
    n = args.n
@@ -83,3 +79,14 @@ if __name__ == "__main__":
        output = get_response(response)
        for i, line in enumerate(output):
            print(f"Beam candidate {i}: {line!r}", flush=True)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--n", type=int, default=1)
+    parser.add_argument("--prompt", type=str, default="San Francisco is a")
+    parser.add_argument("--stream", action="store_true")
+    args = parser.parse_args()
+    main(args)
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -23,7 +23,7 @@ def sync_openai():
    with open(str(mary_had_lamb), "rb") as f:
        transcription = client.audio.transcriptions.create(
            file=f,
-            model="openai/whisper-small",
+            model="openai/whisper-large-v3",
            language="en",
            response_format="json",
            temperature=0.0)

--- a/examples/template_florence2.jinja
+++ b/examples/template_florence2.jinja
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' -%}
+        {{- message['content'] -}}
+    {%- elif message['role'] == 'assistant' -%}
+        {{- message['content'] -}}
+    {%- endif -%}
+{%- endfor -%}
--- a/examples/tool_chat_template_llama3.2_pythonic.jinja
+++ b/examples/tool_chat_template_llama3.2_pythonic.jinja
@@ -76,7 +76,7 @@
            {{- tool_call.name + '(' -}}
            {%- for param in tool_call.arguments %}
                {{- param + '=' -}}
-                {{- "%sr" | format(tool_call.arguments[param]) -}}
+                {{- "%s" | format(tool_call.arguments[param]) -}}
                {% if not loop.last %}, {% endif %}
            {%- endfor %}
            {{- ')' -}}

--- a/examples/tool_chat_template_llama4_pythonic.jinja
+++ b/examples/tool_chat_template_llama4_pythonic.jinja
+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = false %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- if messages[0]['content'] is string %}
+        {%- set system_message = messages[0]['content']|trim %}
+    {%- else %}
+        {%- set system_message = messages[0]['content'][0]['text']|trim %}
+    {%- endif %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- if tools is not none %}
+        {#- Add default tool system message when tools are provided #}
+        {%- set system_message = "You are a helpful assistant with tool calling "
+            "capabilities. Only reply with a tool call if the function exists in the "
+            "library provided by the user. If it doesn't exist, just reply directly in "
+            "natural language. When you receive a tool call response, use the output to "
+            "format an answer to the original user question." %}
+    {%- else %}
+        {%- set system_message = "" %}
+    {%- endif %}
+{%- endif %}
+
+{#- System message if the user supplied one, or if tools are used (default tool system message) #}
+{%- if system_message %}
+    {#- always use user provided system message to override default tool system message #}
+    {{- "<|header_start|>system<|header_end|>\n\n" }}
+    {{- system_message }}
+    {%- if tools is not none and not tools_in_user_message %}
+        {{- "Tools: You have access to the following tools. You might need to use one "
+            "or more function/tool calls to fulfill the task. \n"
+            "If none are needed, then proceed to the response.\n\n"
+            "Tool Call Syntax: You can call tools using the following syntax:\n"
+            "[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n"
+            "Do not include anything else when calling the tools with the syntax above.\n\n"
+            "Here is a list of functions in JSON format that you can invoke.\n " }}
+        {%- for t in tools %}
+            {{- t | tojson(indent=4) }}
+            {{- "\n\n" }}
+        {%- endfor %}
+    {%- endif %}
+    {{- "<|eot|>" }}
+{%- endif %}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and tools is not none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- if messages[0]['content'] is string %}
+            {%- set first_user_message = messages[0]['content']|trim %}
+        {%- else %}
+            {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %}
+        {%- endif %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+    {%- endif %}
+    {{- '<|header_start|>user<|header_end|>\n\n' -}}
+    {{- first_user_message}}
+    {{- "\nHere is a list of functions in JSON format that you can invoke:"}}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- "Should you decide to return the function call(s), put them in the format "
+        "of [func_name1(params_name1=params_value1, params_name2=params_value2, "
+        "...), ...]\nDo not include anything else when calling the tools with the "
+        "syntax above." }}
+{%- endif %}
+
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+    {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
+        {%- if message['content'] is string %}
+            {{- message['content'] }}
+        {%- else %}
+            {%- for content in message['content'] %}
+                {%- if content['type'] == 'image' %}
+                    {{- '<|image|>' }}
+                {%- elif content['type'] == 'text' %}
+                    {{- content['text'] | trim }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- "<|eot|>" }}
+    {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {{- '<|header_start|>assistant<|header_end|>\n\n' -}}
+        {%- if message['content'] is string %}
+            {{- message['content'] }}
+        {%- else %}
+            {%- for content in message['content'] %}
+                {%- if content['type'] == 'image' %}
+                    {{- '<|image|>' }}
+                {%- elif content['type'] == 'text' %}
+                    {{- content['text'] }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- tool_call.name + '(' -}}
+            {%- for param in tool_call.arguments %}
+                {{- param + '=' -}}
+                {{- "%s" | format(tool_call.arguments[param]) -}}
+                {% if not loop.last %}, {% endif %}
+            {%- endfor %}
+            {{- ')' -}}
+            {% if not loop.last %}, {% endif %}
+        {%- endfor %}
+        {{- "<|eom|>" }}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|header_start|>ipython<|header_end|>\n\n" }}
+        {%- if message.content is string %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {%- for content in message['content']  %}
+                {%- if content['type']  == 'text' %}
+                    {{- content['text'] | tojson }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- "<|eom|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|header_start|>assistant<|header_end|>\n\n' }}
+{%- endif %}
--- a/examples/tool_chat_template_toolace.jinja
+++ b/examples/tool_chat_template_toolace.jinja
@@ -44,7 +44,7 @@
            {{- tool_call.name + '(' -}}
            {%- for param in tool_call.arguments %}
                {{- param + '=' -}}
-                {{- "%sr" | format(tool_call.arguments[param]) -}}
+                {{- "%s" | format(tool_call.arguments[param]) -}}
                {% if not loop.last %}, {% endif %}
            {%- endfor %}
            {{- ')' -}}

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,7 @@ classifiers = [
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Information Analysis",
 ]
-requires-python = ">=3.9"
+requires-python = ">=3.9,<3.13"
 dynamic = [ "version", "dependencies", "optional-dependencies"]

 [project.urls]

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -6,7 +6,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.51.0
+transformers >= 4.51.1
 huggingface-hub[hf_xet] >= 0.30.0  # Required for Xet downloads.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
@@ -22,13 +22,13 @@ lm-format-enforcer >= 0.10.11, < 0.11
 llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
 outlines == 0.1.11
 lark == 1.2.2
-xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
+xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq
 msgspec
-gguf == 0.10.0
+gguf >= 0.13.0
 importlib_metadata
 mistral_common[opencv] >= 1.5.4 # requires numpy>=1.25
 opencv-python-headless >= 4.11.0    # required for video IO
@@ -36,10 +36,14 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.2 # required for compressed-tensors
+compressed-tensors == 0.9.3 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/other/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
+opentelemetry-sdk>=1.26.0,<1.27.0  # vllm.tracing
+opentelemetry-api>=1.26.0,<1.27.0  # vllm.tracing
+opentelemetry-exporter-otlp>=1.26.0,<1.27.0  # vllm.tracing
+opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0  # vllm.tracing