[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>

[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
f17aec0d · Reid · GitHub · 493c2753 · f17aec0d · f17aec0d
Unverified Commit f17aec0d authored Jun 23, 2025 by Reid Committed by GitHub Jun 23, 2025
20 changed files
--- a/docs/design/kernel/paged_attention.md
+++ b/docs/design/kernel/paged_attention.md
@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
 all output results have been calculated; they are just stored in the
 register memory of different threads.
-```cpp
+??? Code
-float* out_smem = reinterpret_cast<float*>(shared_mem);
-for (int i = NUM_WARPS; i > 1; i /= 2) {
-    // Upper warps write to shared memory.
-    ...
-    float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
-    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-        ...
-        dst[row_idx] = accs[i];
-    }
-    // Lower warps update the output.
+    ```cpp
-    const float* src = &out_smem[warp_idx * HEAD_SIZE];
+    float* out_smem = reinterpret_cast<float*>(shared_mem);
-    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    for (int i = NUM_WARPS; i > 1; i /= 2) {
+        // Upper warps write to shared memory.
        ...
-        accs[i] += src[row_idx];
+        float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+            ...
+            dst[row_idx] = accs[i];
+        }
+        // Lower warps update the output.
+        const float* src = &out_smem[warp_idx * HEAD_SIZE];
+        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+            ...
+            accs[i] += src[row_idx];
+        }
+        // Write out the accs.
    }
+    ```
-    // Write out the accs.
-}
-```
 ## Output

--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
 vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
-```python
+??? Code
-# inside `setup.py` file
-from setuptools import setup
+    ```python
+    # inside `setup.py` file
-setup(name='vllm_add_dummy_model',
+    from setuptools import setup
-      version='0.1',
-      packages=['vllm_add_dummy_model'],
+    setup(name='vllm_add_dummy_model',
-      entry_points={
+        version='0.1',
-          'vllm.general_plugins':
+        packages=['vllm_add_dummy_model'],
-          ["register_dummy_model = vllm_add_dummy_model:register"]
+        entry_points={
-      })
+            'vllm.general_plugins':
+            ["register_dummy_model = vllm_add_dummy_model:register"]
-# inside `vllm_add_dummy_model.py` file
+        })
-def register():
-    from vllm import ModelRegistry
+    # inside `vllm_add_dummy_model.py` file
+    def register():
-    if "MyLlava" not in ModelRegistry.get_supported_archs():
+        from vllm import ModelRegistry
-        ModelRegistry.register_model(
-            "MyLlava",
+        if "MyLlava" not in ModelRegistry.get_supported_archs():
-            "vllm_add_dummy_model.my_llava:MyLlava",
+            ModelRegistry.register_model(
-        )
+                "MyLlava",
-```
+                "vllm_add_dummy_model.my_llava:MyLlava",
+            )
+    ```
 For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).

--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
 of `LoRARequest` is a human-identifiable name, the second parameter is a globally unique ID for the adapter and
 the third parameter is the path to the LoRA adapter.
-```python
+??? Code
-sampling_params = SamplingParams(
-    temperature=0,
+    ```python
-    max_tokens=256,
+    sampling_params = SamplingParams(
-    stop=["[/assistant]"]
+        temperature=0,
-)
+        max_tokens=256,
+        stop=["[/assistant]"]
-prompts = [
+    )
-     "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
-     "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
+    prompts = [
-]
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
-outputs = llm.generate(
+    ]
-    prompts,
-    sampling_params,
+    outputs = llm.generate(
-    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
+        prompts,
-)
+        sampling_params,
-```
+        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
+    )
+    ```
 Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
 etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
 with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
-```bash
+??? Command
-curl localhost:8000/v1/models | jq .
-{
+    ```bash
-    "object": "list",
+    curl localhost:8000/v1/models | jq .
-    "data": [
+    {
-        {
+        "object": "list",
-            "id": "meta-llama/Llama-2-7b-hf",
+        "data": [
-            "object": "model",
+            {
-            ...
+                "id": "meta-llama/Llama-2-7b-hf",
-        },
+                "object": "model",
-        {
+                ...
-            "id": "sql-lora",
+            },
-            "object": "model",
+            {
-            ...
+                "id": "sql-lora",
-        }
+                "object": "model",
-    ]
+                ...
-}
+            }
-```
+        ]
+    }
+    ```
 Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
 processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
 1. Implement the LoRAResolver interface.
-    Example of a simple S3 LoRAResolver implementation:
+    ??? Example of a simple S3 LoRAResolver implementation
-    ```python
+        ```python
-    import os
+        import os
-    import s3fs
+        import s3fs
-    from vllm.lora.request import LoRARequest
+        from vllm.lora.request import LoRARequest
-    from vllm.lora.resolver import LoRAResolver
+        from vllm.lora.resolver import LoRAResolver
-    class S3LoRAResolver(LoRAResolver):
+        class S3LoRAResolver(LoRAResolver):
-        def __init__(self):
+            def __init__(self):
-            self.s3 = s3fs.S3FileSystem()
+                self.s3 = s3fs.S3FileSystem()
-            self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
+                self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
-            self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
+                self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
-        async def resolve_lora(self, base_model_name, lora_name):
+            async def resolve_lora(self, base_model_name, lora_name):
-            s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
+                s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
-            local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
+                local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
-            # Download the LoRA from S3 to the local path
+                # Download the LoRA from S3 to the local path
-            await self.s3._get(
+                await self.s3._get(
-                s3_path, local_path, recursive=True, maxdepth=1
+                    s3_path, local_path, recursive=True, maxdepth=1
-            )
+                )
-            lora_request = LoRARequest(
+                lora_request = LoRARequest(
-                lora_name=lora_name,
+                    lora_name=lora_name,
-                lora_path=local_path,
+                    lora_path=local_path,
-                lora_int_id=abs(hash(lora_name))
+                    lora_int_id=abs(hash(lora_name))
-            )
+                )
-            return lora_request
+                return lora_request
-    ```
+        ```
 2. Register `LoRAResolver` plugin.
@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
 - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
 - The `root` field points to the artifact location of the lora adapter.
-```bash
+??? Command output
-$ curl http://localhost:8000/v1/models
+    ```bash
-{
+    $ curl http://localhost:8000/v1/models
-    "object": "list",
-    "data": [
+    {
-        {
+        "object": "list",
-        "id": "meta-llama/Llama-2-7b-hf",
+        "data": [
-        "object": "model",
-        "created": 1715644056,
-        "owned_by": "vllm",
-        "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
-        "parent": null,
-        "permission": [
            {
-            .....
+            "id": "meta-llama/Llama-2-7b-hf",
-            }
+            "object": "model",
-        ]
+            "created": 1715644056,
-        },
+            "owned_by": "vllm",
-        {
+            "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
-        "id": "sql-lora",
+            "parent": null,
-        "object": "model",
+            "permission": [
-        "created": 1715644056,
+                {
-        "owned_by": "vllm",
+                .....
-        "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
+                }
-        "parent": meta-llama/Llama-2-7b-hf,
+            ]
-        "permission": [
+            },
            {
-            ....
+            "id": "sql-lora",
+            "object": "model",
+            "created": 1715644056,
+            "owned_by": "vllm",
+            "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
+            "parent": meta-llama/Llama-2-7b-hf,
+            "permission": [
+                {
+                ....
+                }
+            ]
            }
        ]
-        }
+    }
-    ]
+    ```
-}
-```
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -20,111 +20,117 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
 You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
-```python
+??? Code
-from vllm import LLM
-llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-# Refer to the HuggingFace repo for the correct format to use
-prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
-# Load the image using PIL.Image
-image = PIL.Image.open(...)
-# Single prompt inference
-outputs = llm.generate({
-    "prompt": prompt,
-    "multi_modal_data": {"image": image},
-})
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-# Batch inference
-image_1 = PIL.Image.open(...)
-image_2 = PIL.Image.open(...)
-outputs = llm.generate(
-    [
-        {
-            "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
-            "multi_modal_data": {"image": image_1},
-        },
-        {
-            "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
-            "multi_modal_data": {"image": image_2},
-        }
-    ]
-)
-for o in outputs:
+    ```python
-    generated_text = o.outputs[0].text
+    from vllm import LLM
-    print(generated_text)
-```
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    # Refer to the HuggingFace repo for the correct format to use
+    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
+    # Load the image using PIL.Image
+    image = PIL.Image.open(...)
+    # Single prompt inference
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": {"image": image},
+    })
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+    # Batch inference
+    image_1 = PIL.Image.open(...)
+    image_2 = PIL.Image.open(...)
+    outputs = llm.generate(
+        [
+            {
+                "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
+                "multi_modal_data": {"image": image_1},
+            },
+            {
+                "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
+                "multi_modal_data": {"image": image_2},
+            }
+        ]
+    )
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+    ```
 Full example: <gh-file:examples/offline_inference/vision_language.py>
 To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
-```python
+??? Code
-from vllm import LLM
+    ```python
-llm = LLM(
+    from vllm import LLM
-    model="microsoft/Phi-3.5-vision-instruct",
-    trust_remote_code=True,  # Required to load Phi-3.5-vision
+    llm = LLM(
-    max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
+        model="microsoft/Phi-3.5-vision-instruct",
-    limit_mm_per_prompt={"image": 2},  # The maximum number to accept
+        trust_remote_code=True,  # Required to load Phi-3.5-vision
-)
+        max_model_len=4096,  # Otherwise, it may not fit in smaller GPUs
+        limit_mm_per_prompt={"image": 2},  # The maximum number to accept
-# Refer to the HuggingFace repo for the correct format to use
+    )
-prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
+    # Refer to the HuggingFace repo for the correct format to use
-# Load the images using PIL.Image
+    prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
-image1 = PIL.Image.open(...)
-image2 = PIL.Image.open(...)
+    # Load the images using PIL.Image
+    image1 = PIL.Image.open(...)
-outputs = llm.generate({
+    image2 = PIL.Image.open(...)
-    "prompt": prompt,
-    "multi_modal_data": {
+    outputs = llm.generate({
-        "image": [image1, image2]
+        "prompt": prompt,
-    },
+        "multi_modal_data": {
-})
+            "image": [image1, image2]
+        },
-for o in outputs:
+    })
-    generated_text = o.outputs[0].text
-    print(generated_text)
+    for o in outputs:
-```
+        generated_text = o.outputs[0].text
+        print(generated_text)
+    ```
 Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
-```python
+??? Code
-from vllm import LLM
-# Specify the maximum number of frames per video to be 4. This can be changed.
+    ```python
-llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+    from vllm import LLM
-# Create the request payload.
+    # Specify the maximum number of frames per video to be 4. This can be changed.
-video_frames = ... # load your video making sure it only has the number of frames specified earlier.
+    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
-message = {
-    "role": "user",
+    # Create the request payload.
-    "content": [
+    video_frames = ... # load your video making sure it only has the number of frames specified earlier.
-        {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
+    message = {
-    ],
+        "role": "user",
-}
+        "content": [
-for i in range(len(video_frames)):
+            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
-    base64_image = encode_image(video_frames[i]) # base64 encoding.
+        ],
-    new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+    }
-    message["content"].append(new_image)
+    for i in range(len(video_frames)):
+        base64_image = encode_image(video_frames[i]) # base64 encoding.
-# Perform inference and log output.
+        new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
-outputs = llm.chat([message])
+        message["content"].append(new_image)
-for o in outputs:
+    # Perform inference and log output.
-    generated_text = o.outputs[0].text
+    outputs = llm.chat([message])
-    print(generated_text)
-```
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+    ```
 ### Video Inputs
@@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
 To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
 pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
-```python
+??? Code
-from vllm import LLM
-# Inference with image embeddings as input
+    ```python
-llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+    from vllm import LLM
-# Refer to the HuggingFace repo for the correct format to use
+    # Inference with image embeddings as input
-prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-# Embeddings for single image
+    # Refer to the HuggingFace repo for the correct format to use
-# torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
+    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
-image_embeds = torch.load(...)
-outputs = llm.generate({
+    # Embeddings for single image
-    "prompt": prompt,
+    # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
-    "multi_modal_data": {"image": image_embeds},
+    image_embeds = torch.load(...)
-})
-for o in outputs:
+    outputs = llm.generate({
-    generated_text = o.outputs[0].text
+        "prompt": prompt,
-    print(generated_text)
+        "multi_modal_data": {"image": image_embeds},
-```
+    })
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+    ```
 For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
-```python
+??? Code
-# Construct the prompt based on your model
-prompt = ...
+    ```python
+    # Construct the prompt based on your model
-# Embeddings for multiple images
+    prompt = ...
-# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
-image_embeds = torch.load(...)
+    # Embeddings for multiple images
+    # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
-# Qwen2-VL
+    image_embeds = torch.load(...)
-llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
-mm_data = {
+    # Qwen2-VL
-    "image": {
+    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
-        "image_embeds": image_embeds,
+    mm_data = {
-        # image_grid_thw is needed to calculate positional encoding.
+        "image": {
-        "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3),
+            "image_embeds": image_embeds,
+            # image_grid_thw is needed to calculate positional encoding.
+            "image_grid_thw": torch.load(...),  # torch.Tensor of shape (1, 3),
+        }
    }
-}
+    # MiniCPM-V
-# MiniCPM-V
+    llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
-llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
+    mm_data = {
-mm_data = {
+        "image": {
-    "image": {
+            "image_embeds": image_embeds,
-        "image_embeds": image_embeds,
+            # image_sizes is needed to calculate details of the sliced image.
-        # image_sizes is needed to calculate details of the sliced image.
+            "image_sizes": [image.size for image in images],  # list of image sizes
-        "image_sizes": [image.size for image in images],  # list of image sizes
+        }
    }
-}
-outputs = llm.generate({
+    outputs = llm.generate({
-    "prompt": prompt,
+        "prompt": prompt,
-    "multi_modal_data": mm_data,
+        "multi_modal_data": mm_data,
-})
+    })
-for o in outputs:
+    for o in outputs:
-    generated_text = o.outputs[0].text
+        generated_text = o.outputs[0].text
-    print(generated_text)
+        print(generated_text)
-```
+    ```
 ## Online Serving
@@ -235,51 +245,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
 Then, you can use the OpenAI client as follows:
-```python
+??? Code
-from openai import OpenAI
+    ```python
-openai_api_key = "EMPTY"
+    from openai import OpenAI
-openai_api_base = "http://localhost:8000/v1"
+    openai_api_key = "EMPTY"
-client = OpenAI(
+    openai_api_base = "http://localhost:8000/v1"
-    api_key=openai_api_key,
-    base_url=openai_api_base,
+    client = OpenAI(
-)
+        api_key=openai_api_key,
+        base_url=openai_api_base,
-# Single-image input inference
+    )
-image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    # Single-image input inference
-chat_response = client.chat.completions.create(
+    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-    model="microsoft/Phi-3.5-vision-instruct",
-    messages=[{
+    chat_response = client.chat.completions.create(
-        "role": "user",
+        model="microsoft/Phi-3.5-vision-instruct",
-        "content": [
+        messages=[{
-            # NOTE: The prompt formatting with the image token `<image>` is not needed
+            "role": "user",
-            # since the prompt will be processed automatically by the API server.
+            "content": [
-            {"type": "text", "text": "What’s in this image?"},
+                # NOTE: The prompt formatting with the image token `<image>` is not needed
-            {"type": "image_url", "image_url": {"url": image_url}},
+                # since the prompt will be processed automatically by the API server.
-        ],
+                {"type": "text", "text": "What’s in this image?"},
-    }],
+                {"type": "image_url", "image_url": {"url": image_url}},
-)
+            ],
-print("Chat completion output:", chat_response.choices[0].message.content)
+        }],
+    )
-# Multi-image input inference
+    print("Chat completion output:", chat_response.choices[0].message.content)
-image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
-image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+    # Multi-image input inference
+    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
-chat_response = client.chat.completions.create(
+    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
-    model="microsoft/Phi-3.5-vision-instruct",
-    messages=[{
+    chat_response = client.chat.completions.create(
-        "role": "user",
+        model="microsoft/Phi-3.5-vision-instruct",
-        "content": [
+        messages=[{
-            {"type": "text", "text": "What are the animals in these images?"},
+            "role": "user",
-            {"type": "image_url", "image_url": {"url": image_url_duck}},
+            "content": [
-            {"type": "image_url", "image_url": {"url": image_url_lion}},
+                {"type": "text", "text": "What are the animals in these images?"},
-        ],
+                {"type": "image_url", "image_url": {"url": image_url_duck}},
-    }],
+                {"type": "image_url", "image_url": {"url": image_url_lion}},
-)
+            ],
-print("Chat completion output:", chat_response.choices[0].message.content)
+        }],
-```
+    )
+    print("Chat completion output:", chat_response.choices[0].message.content)
+    ```
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@@ -311,44 +323,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
 Then, you can use the OpenAI client as follows:
-```python
+??? Code
-from openai import OpenAI
-openai_api_key = "EMPTY"
+    ```python
-openai_api_base = "http://localhost:8000/v1"
+    from openai import OpenAI
-client = OpenAI(
+    openai_api_key = "EMPTY"
-    api_key=openai_api_key,
+    openai_api_base = "http://localhost:8000/v1"
-    base_url=openai_api_base,
-)
-video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
-## Use video url in the payload
+    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
-chat_completion_from_url = client.chat.completions.create(
-    messages=[{
-        "role":
-        "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this video?"
-            },
-            {
-                "type": "video_url",
-                "video_url": {
-                    "url": video_url
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_completion_tokens=64,
-)
-result = chat_completion_from_url.choices[0].message.content
+    ## Use video url in the payload
-print("Chat completion output from image url:", result)
+    chat_completion_from_url = client.chat.completions.create(
-```
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": video_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from image url:", result)
+    ```
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@@ -373,84 +387,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
 Then, you can use the OpenAI client as follows:
-```python
+??? Code
-import base64
-import requests
-from openai import OpenAI
-from vllm.assets.audio import AudioAsset
-def encode_base64_content_from_url(content_url: str) -> str:
+    ```python
-    """Encode a content retrieved from a remote url to base64 format."""
+    import base64
+    import requests
+    from openai import OpenAI
+    from vllm.assets.audio import AudioAsset
-    with requests.get(content_url) as response:
+    def encode_base64_content_from_url(content_url: str) -> str:
-        response.raise_for_status()
+        """Encode a content retrieved from a remote url to base64 format."""
-        result = base64.b64encode(response.content).decode('utf-8')
-    return result
+        with requests.get(content_url) as response:
+            response.raise_for_status()
+            result = base64.b64encode(response.content).decode('utf-8')
-openai_api_key = "EMPTY"
+        return result
-openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
+    openai_api_key = "EMPTY"
-    api_key=openai_api_key,
+    openai_api_base = "http://localhost:8000/v1"
-    base_url=openai_api_base,
-)
-# Any format supported by librosa is supported
+    client = OpenAI(
-audio_url = AudioAsset("winning_call").url
+        api_key=openai_api_key,
-audio_base64 = encode_base64_content_from_url(audio_url)
+        base_url=openai_api_base,
+    )
-chat_completion_from_base64 = client.chat.completions.create(
+    # Any format supported by librosa is supported
-    messages=[{
+    audio_url = AudioAsset("winning_call").url
-        "role": "user",
+    audio_base64 = encode_base64_content_from_url(audio_url)
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this audio?"
-            },
-            {
-                "type": "input_audio",
-                "input_audio": {
-                    "data": audio_base64,
-                    "format": "wav"
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_completion_tokens=64,
-)
-result = chat_completion_from_base64.choices[0].message.content
+    chat_completion_from_base64 = client.chat.completions.create(
-print("Chat completion output from input audio:", result)
+        messages=[{
-```
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": audio_base64,
+                        "format": "wav"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+    ```
 Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
-```python
+??? Code
-chat_completion_from_url = client.chat.completions.create(
-    messages=[{
-        "role": "user",
-        "content": [
-            {
-                "type": "text",
-                "text": "What's in this audio?"
-            },
-            {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
-                },
-            },
-        ],
-    }],
-    model=model,
-    max_completion_tokens=64,
-)
-result = chat_completion_from_url.choices[0].message.content
+    ```python
-print("Chat completion output from audio url:", result)
+    chat_completion_from_url = client.chat.completions.create(
-```
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": audio_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from audio url:", result)
+    ```
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
@@ -470,61 +488,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
 For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
 The following example demonstrates how to pass image embeddings to the OpenAI server:
-```python
+??? Code
-image_embedding = torch.load(...)
-grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
+    ```python
+    image_embedding = torch.load(...)
-buffer = io.BytesIO()
+    grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
-torch.save(image_embedding, buffer)
-buffer.seek(0)
+    buffer = io.BytesIO()
-binary_data = buffer.read()
+    torch.save(image_embedding, buffer)
-base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
+    buffer.seek(0)
+    binary_data = buffer.read()
-client = OpenAI(
+    base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
+    client = OpenAI(
-    base_url=openai_api_base,
+        # defaults to os.environ.get("OPENAI_API_KEY")
-)
+        api_key=openai_api_key,
+        base_url=openai_api_base,
-# Basic usage - this is equivalent to the LLaVA example for offline inference
+    )
-model = "llava-hf/llava-1.5-7b-hf"
-embeds =  {
+    # Basic usage - this is equivalent to the LLaVA example for offline inference
-    "type": "image_embeds",
+    model = "llava-hf/llava-1.5-7b-hf"
-    "image_embeds": f"{base64_image_embedding}" 
+    embeds =  {
-}
+        "type": "image_embeds",
+        "image_embeds": f"{base64_image_embedding}" 
-# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
+    }
-model = "Qwen/Qwen2-VL-2B-Instruct"
-embeds =  {
+    # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
-    "type": "image_embeds",
+    model = "Qwen/Qwen2-VL-2B-Instruct"
-    "image_embeds": {
+    embeds =  {
-        "image_embeds": f"{base64_image_embedding}" , # Required
+        "type": "image_embeds",
-        "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
+        "image_embeds": {
-    },
+            "image_embeds": f"{base64_image_embedding}" , # Required
-}
+            "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
-model = "openbmb/MiniCPM-V-2_6"
-embeds =  {
-    "type": "image_embeds",
-    "image_embeds": {
-        "image_embeds": f"{base64_image_embedding}" , # Required
-        "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
-    },
-}
-chat_completion = client.chat.completions.create(
-    messages=[
-    {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": [
-        {
-            "type": "text",
-            "text": "What's in this image?",
        },
-        embeds,
+    }
-        ],
+    model = "openbmb/MiniCPM-V-2_6"
-    },
+    embeds =  {
-],
+        "type": "image_embeds",
-    model=model,
+        "image_embeds": {
-)
+            "image_embeds": f"{base64_image_embedding}" , # Required
-```
+            "image_sizes": f"{base64_image_sizes}"  # Required by openbmb/MiniCPM-V-2_6
+        },
+    }
+    chat_completion = client.chat.completions.create(
+        messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": [
+            {
+                "type": "text",
+                "text": "What's in this image?",
+            },
+            embeds,
+            ],
+        },
+    ],
+        model=model,
+    )
+    ```
 !!! note
    Only one message can contain `{"type": "image_embeds"}`.

--- a/docs/features/quantization/auto_awq.md
+++ b/docs/features/quantization/auto_awq.md
@@ -15,29 +15,31 @@ pip install autoawq
 After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
-```python
+??? Code
-from awq import AutoAWQForCausalLM
-from transformers import AutoTokenizer
-model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
+    ```python
-quant_path = 'mistral-instruct-v0.2-awq'
+    from awq import AutoAWQForCausalLM
-quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+    from transformers import AutoTokenizer
-# Load model
+    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-model = AutoAWQForCausalLM.from_pretrained(
+    quant_path = 'mistral-instruct-v0.2-awq'
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
-)
-tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-# Quantize
+    # Load model
-model.quantize(tokenizer, quant_config=quant_config)
+    model = AutoAWQForCausalLM.from_pretrained(
+        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-# Save quantized model
+    # Quantize
-model.save_quantized(quant_path)
+    model.quantize(tokenizer, quant_config=quant_config)
-tokenizer.save_pretrained(quant_path)
-print(f'Model is quantized and saved at "{quant_path}"')
+    # Save quantized model
-```
+    model.save_quantized(quant_path)
+    tokenizer.save_pretrained(quant_path)
+    print(f'Model is quantized and saved at "{quant_path}"')
+    ```
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
 AWQ models are also supported directly through the LLM entrypoint:
-```python
+??? Code
-from vllm import LLM, SamplingParams
+    ```python
-# Sample prompts.
+    from vllm import LLM, SamplingParams
-prompts = [
-    "Hello, my name is",
+    # Sample prompts.
-    "The president of the United States is",
+    prompts = [
-    "The capital of France is",
+        "Hello, my name is",
-    "The future of AI is",
+        "The president of the United States is",
-]
+        "The capital of France is",
-# Create a sampling params object.
+        "The future of AI is",
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ]
+    # Create a sampling params object.
-# Create an LLM.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+    # Create an LLM.
-# that contain the prompt, generated text, and other information.
+    llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
-outputs = llm.generate(prompts, sampling_params)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
-# Print the outputs.
+    # that contain the prompt, generated text, and other information.
-for output in outputs:
+    outputs = llm.generate(prompts, sampling_params)
-    prompt = output.prompt
+    # Print the outputs.
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        prompt = output.prompt
-```
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
--- a/docs/features/quantization/bitblas.md
+++ b/docs/features/quantization/bitblas.md
@@ -43,17 +43,19 @@ llm = LLM(
 ## Read gptq format checkpoint
-```python
+??? Code
-from vllm import LLM
-import torch
+    ```python
+    from vllm import LLM
-# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
+    import torch
-model_id = "hxbgsyxh/llama-13b-4bit-g-1"
-llm = LLM(
+    # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
-    model=model_id,
+    model_id = "hxbgsyxh/llama-13b-4bit-g-1"
-    dtype=torch.float16,
+    llm = LLM(
-    trust_remote_code=True,
+        model=model_id,
-    quantization="bitblas",
+        dtype=torch.float16,
-    max_model_len=1024
+        trust_remote_code=True,
-)
+        quantization="bitblas",
-```
+        max_model_len=1024
+    )
+    ```
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
 Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
-```python
+??? Code
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import QuantizationModifier
-# Configure the simple PTQ quantization
+    ```python
-recipe = QuantizationModifier(
+    from llmcompressor.transformers import oneshot
-  targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    from llmcompressor.modifiers.quantization import QuantizationModifier
-# Apply the quantization algorithm.
+    # Configure the simple PTQ quantization
-oneshot(model=model, recipe=recipe)
+    recipe = QuantizationModifier(
+      targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
-# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
+    # Apply the quantization algorithm.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+    oneshot(model=model, recipe=recipe)
-model.save_pretrained(SAVE_DIR)
-tokenizer.save_pretrained(SAVE_DIR)
+    # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
-```
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+    model.save_pretrained(SAVE_DIR)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```
 ### 3. Evaluating Accuracy

--- a/docs/features/quantization/gguf.md
+++ b/docs/features/quantization/gguf.md
@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
 You can also use the GGUF model directly through the LLM entrypoint:
-```python
+??? Code
-from vllm import LLM, SamplingParams
+      ```python
-# In this script, we demonstrate how to pass input to the chat method:
+      from vllm import LLM, SamplingParams
-conversation = [
-   {
+      # In this script, we demonstrate how to pass input to the chat method:
-      "role": "system",
+      conversation = [
-      "content": "You are a helpful assistant"
+         {
-   },
+            "role": "system",
-   {
+            "content": "You are a helpful assistant"
-      "role": "user",
+         },
-      "content": "Hello"
+         {
-   },
+            "role": "user",
-   {
+            "content": "Hello"
-      "role": "assistant",
+         },
-      "content": "Hello! How can I assist you today?"
+         {
-   },
+            "role": "assistant",
-   {
+            "content": "Hello! How can I assist you today?"
-      "role": "user",
+         },
-      "content": "Write an essay about the importance of higher education.",
+         {
-   },
+            "role": "user",
-]
+            "content": "Write an essay about the importance of higher education.",
+         },
-# Create a sampling params object.
+      ]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+      # Create a sampling params object.
-# Create an LLM.
+      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-         tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+      # Create an LLM.
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+      llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-# that contain the prompt, generated text, and other information.
+               tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-outputs = llm.chat(conversation, sampling_params)
+      # Generate texts from the prompts. The output is a list of RequestOutput objects
+      # that contain the prompt, generated text, and other information.
-# Print the outputs.
+      outputs = llm.chat(conversation, sampling_params)
-for output in outputs:
-   prompt = output.prompt
+      # Print the outputs.
-   generated_text = output.outputs[0].text
+      for output in outputs:
-   print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+         prompt = output.prompt
-```
+         generated_text = output.outputs[0].text
+         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+      ```
--- a/docs/features/quantization/gptqmodel.md
+++ b/docs/features/quantization/gptqmodel.md
@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
 Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
-```python
+??? Code
-from datasets import load_dataset
-from gptqmodel import GPTQModel, QuantizeConfig
-model_id = "meta-llama/Llama-3.2-1B-Instruct"
+    ```python
-quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
+    from datasets import load_dataset
+    from gptqmodel import GPTQModel, QuantizeConfig
-calibration_dataset = load_dataset(
+    model_id = "meta-llama/Llama-3.2-1B-Instruct"
-    "allenai/c4",
+    quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
-    data_files="en/c4-train.00001-of-01024.json.gz",
-    split="train"
-  ).select(range(1024))["text"]
-quant_config = QuantizeConfig(bits=4, group_size=128)
+    calibration_dataset = load_dataset(
+        "allenai/c4",
+        data_files="en/c4-train.00001-of-01024.json.gz",
+        split="train"
+    ).select(range(1024))["text"]
-model = GPTQModel.load(model_id, quant_config)
+    quant_config = QuantizeConfig(bits=4, group_size=128)
-# increase `batch_size` to match gpu/vram specs to speed up quantization
+    model = GPTQModel.load(model_id, quant_config)
-model.quantize(calibration_dataset, batch_size=2)
-model.save(quant_path)
+    # increase `batch_size` to match gpu/vram specs to speed up quantization
-```
+    model.quantize(calibration_dataset, batch_size=2)
+    model.save(quant_path)
+    ```
 ## Running a quantized model with vLLM
@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
 GPTQModel quantized models are also supported directly through the LLM entrypoint:
-```python
+??? Code
-from vllm import LLM, SamplingParams
+    ```python
-# Sample prompts.
+    from vllm import LLM, SamplingParams
-prompts = [
-    "Hello, my name is",
+    # Sample prompts.
-    "The president of the United States is",
+    prompts = [
-    "The capital of France is",
+        "Hello, my name is",
-    "The future of AI is",
+        "The president of the United States is",
-]
+        "The capital of France is",
+        "The future of AI is",
-# Create a sampling params object.
+    ]
-sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
+    # Create a sampling params object.
-# Create an LLM.
+    sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
-llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
+    # Create an LLM.
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+    llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
-# Print the outputs.
+    outputs = llm.generate(prompts, sampling_params)
-print("-"*50)
-for output in outputs:
+    # Print the outputs.
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-"*50)
-```
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-"*50)
+    ```
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
 It's best to use calibration data that closely matches your deployment data.
 For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
-```python
+??? Code
-from datasets import load_dataset
-NUM_CALIBRATION_SAMPLES = 512
+    ```python
-MAX_SEQUENCE_LENGTH = 2048
+    from datasets import load_dataset
-# Load and preprocess the dataset
+    NUM_CALIBRATION_SAMPLES = 512
-ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+    MAX_SEQUENCE_LENGTH = 2048
-ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-def preprocess(example):
+    # Load and preprocess the dataset
-    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
-ds = ds.map(preprocess)
+    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-def tokenize(sample):
+    def preprocess(example):
-    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ds = ds.map(preprocess)
-```
+    def tokenize(sample):
+        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ```
 ### 3. Applying Quantization
 Now, apply the quantization algorithms:
-```python
+??? Code
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-# Configure the quantization algorithms
-recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
-# Apply quantization
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-)
-# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
+    ```python
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+    from llmcompressor.transformers import oneshot
-model.save_pretrained(SAVE_DIR, save_compressed=True)
+    from llmcompressor.modifiers.quantization import GPTQModifier
-tokenizer.save_pretrained(SAVE_DIR)
+    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-```
+    # Configure the quantization algorithms
+    recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+    # Apply quantization
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    )
+    # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```
 This process creates a W4A16 model with weights quantized to 4-bit integers.
@@ -137,34 +141,36 @@ $ lm_eval --model vllm \
 The following is an example of an expanded quantization recipe you can tune to your own use case:
-```python
+??? Code
-from compressed_tensors.quantization import (
-    QuantizationArgs,
+    ```python
-    QuantizationScheme,
+    from compressed_tensors.quantization import (
-    QuantizationStrategy,
+        QuantizationArgs,
-    QuantizationType,
+        QuantizationScheme,
-) 
+        QuantizationStrategy,
-recipe = GPTQModifier(
+        QuantizationType,
-    targets="Linear",
+    ) 
-    config_groups={
+    recipe = GPTQModifier(
-        "config_group": QuantizationScheme(
+        targets="Linear",
-            targets=["Linear"],
+        config_groups={
-            weights=QuantizationArgs(
+            "config_group": QuantizationScheme(
-                num_bits=4,
+                targets=["Linear"],
-                type=QuantizationType.INT,
+                weights=QuantizationArgs(
-                strategy=QuantizationStrategy.GROUP,
+                    num_bits=4,
-                group_size=128,
+                    type=QuantizationType.INT,
-                symmetric=True,
+                    strategy=QuantizationStrategy.GROUP,
-                dynamic=False,
+                    group_size=128,
-                actorder="weight",
+                    symmetric=True,
+                    dynamic=False,
+                    actorder="weight",
+                ),
            ),
-        ),
+        },
-    },
+        ignore=["lm_head"],
-    ignore=["lm_head"],
+        update_size=NUM_CALIBRATION_SAMPLES,
-    update_size=NUM_CALIBRATION_SAMPLES,
+        dampening_frac=0.01
-    dampening_frac=0.01
+    )
-)
+    ```
-```
 ## Troubleshooting and Support

--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
 It's best to use calibration data that closely matches your deployment data.
 For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
-```python
+??? Code
-from datasets import load_dataset
-NUM_CALIBRATION_SAMPLES = 512
+    ```python
-MAX_SEQUENCE_LENGTH = 2048
+    from datasets import load_dataset
-# Load and preprocess the dataset
+    NUM_CALIBRATION_SAMPLES = 512
-ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+    MAX_SEQUENCE_LENGTH = 2048
-ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-def preprocess(example):
+    # Load and preprocess the dataset
-    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
-ds = ds.map(preprocess)
+    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-def tokenize(sample):
+    def preprocess(example):
-    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ds = ds.map(preprocess)
-```
+    def tokenize(sample):
+        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ```
 ### 3. Applying Quantization
 Now, apply the quantization algorithms:
-```python
+??? Code
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+    ```python
-from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+    from llmcompressor.transformers import oneshot
+    from llmcompressor.modifiers.quantization import GPTQModifier
-# Configure the quantization algorithms
+    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-recipe = [
-    SmoothQuantModifier(smoothing_strength=0.8),
+    # Configure the quantization algorithms
-    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+    recipe = [
-]
+        SmoothQuantModifier(smoothing_strength=0.8),
+        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
-# Apply quantization
+    ]
-oneshot(
-    model=model,
+    # Apply quantization
-    dataset=ds,
+    oneshot(
-    recipe=recipe,
+        model=model,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
+        dataset=ds,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+        recipe=recipe,
-)
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
+    )
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
+    # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
-tokenizer.save_pretrained(SAVE_DIR)
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
-```
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```
 This process creates a W8A8 model with weights and activations quantized to 8-bit integers.

--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
 Below is an example showing how to quantize a model using modelopt's PTQ API:
-```python
+??? Code
-import modelopt.torch.quantization as mtq
-from transformers import AutoModelForCausalLM
-# Load the model from HuggingFace
+    ```python
-model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
+    import modelopt.torch.quantization as mtq
+    from transformers import AutoModelForCausalLM
-# Select the quantization config, for example, FP8
+    # Load the model from HuggingFace
-config = mtq.FP8_DEFAULT_CFG
+    model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
-# Define a forward loop function for calibration
+    # Select the quantization config, for example, FP8
-def forward_loop(model):
+    config = mtq.FP8_DEFAULT_CFG
-    for data in calib_set:
-        model(data)
-# PTQ with in-place replacement of quantized modules
+    # Define a forward loop function for calibration
-model = mtq.quantize(model, config, forward_loop)
+    def forward_loop(model):
-```
+        for data in calib_set:
+            model(data)
+    # PTQ with in-place replacement of quantized modules
+    model = mtq.quantize(model, config, forward_loop)
+    ```
 After the model is quantized, you can export it to a quantized checkpoint using the export API:
@@ -48,31 +50,33 @@ with torch.inference_mode():
 The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
-```python
+??? Code
-from vllm import LLM, SamplingParams
-def main():
+    ```python
+    from vllm import LLM, SamplingParams
-    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
+    def main():
-    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
-    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
+        model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
+        # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+        llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
-    prompts = [
+        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    outputs = llm.generate(prompts, sampling_params)
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
-    for output in outputs:
+        outputs = llm.generate(prompts, sampling_params)
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-if __name__ == "__main__":
+        for output in outputs:
-    main()
+            prompt = output.prompt
-```
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    if __name__ == "__main__":
+        main()
+    ```
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
 Here is an example of how to enable FP8 quantization:
-```python
+??? Code
-# To calculate kv cache scales on the fly enable the calculate_kv_scales
-# parameter
-from vllm import LLM, SamplingParams
+    ```python
+    # To calculate kv cache scales on the fly enable the calculate_kv_scales
+    # parameter
-sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
+    from vllm import LLM, SamplingParams
-llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-          kv_cache_dtype="fp8",
+    sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-          calculate_kv_scales=True)
+    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-prompt = "London is the capital of"
+              kv_cache_dtype="fp8",
-out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+              calculate_kv_scales=True)
-print(out)
+    prompt = "London is the capital of"
-```
+    out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+    print(out)
+    ```
 The `kv_cache_dtype` argument specifies the data type for KV cache storage:
 - `"auto"`: Uses the model's default "unquantized" data type
@@ -71,67 +73,69 @@ pip install llmcompressor
 Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
-```python
+??? Code
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
+    ```python
-from llmcompressor.transformers import oneshot
+    from datasets import load_dataset
+    from transformers import AutoModelForCausalLM, AutoTokenizer
-# Select model and load it
+    from llmcompressor.transformers import oneshot
-MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+    # Select model and load it
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
-# Select calibration dataset
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-DATASET_ID = "HuggingFaceH4/ultrachat_200k"
-DATASET_SPLIT = "train_sft"
+    # Select calibration dataset
+    DATASET_ID = "HuggingFaceH4/ultrachat_200k"
-# Configure calibration parameters
+    DATASET_SPLIT = "train_sft"
-NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
-MAX_SEQUENCE_LENGTH = 2048
+    # Configure calibration parameters
+    NUM_CALIBRATION_SAMPLES = 512  # 512 samples is a good starting point
-# Load and preprocess dataset
+    MAX_SEQUENCE_LENGTH = 2048
-ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
-ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+    # Load and preprocess dataset
+    ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
-def process_and_tokenize(example):
+    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
-    return tokenizer(
+    def process_and_tokenize(example):
-        text,
+        text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
-        padding=False,
+        return tokenizer(
-        max_length=MAX_SEQUENCE_LENGTH,
+            text,
-        truncation=True,
+            padding=False,
-        add_special_tokens=False,
+            max_length=MAX_SEQUENCE_LENGTH,
+            truncation=True,
+            add_special_tokens=False,
+        )
+    ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
+    # Configure quantization settings
+    recipe = """
+    quant_stage:
+        quant_modifiers:
+            QuantizationModifier:
+                kv_cache_scheme:
+                    num_bits: 8
+                    type: float
+                    strategy: tensor
+                    dynamic: false
+                    symmetric: true
+    """
+    # Apply quantization
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )
-ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
+    # Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
-# Configure quantization settings
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
-recipe = """
+    tokenizer.save_pretrained(SAVE_DIR)
-quant_stage:
+    ```
-    quant_modifiers:
-        QuantizationModifier:
-            kv_cache_scheme:
-                num_bits: 8
-                type: float
-                strategy: tensor
-                dynamic: false
-                symmetric: true
-"""
-# Apply quantization
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-)
-# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-tokenizer.save_pretrained(SAVE_DIR)
-```
 The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.

--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below:
 Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
 to fetch model and tokenizer.
-```python
+??? Code
-from transformers import AutoTokenizer, AutoModelForCausalLM
-MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
+    ```python
-MAX_SEQ_LEN = 512
+    from transformers import AutoTokenizer, AutoModelForCausalLM
-model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MAX_SEQ_LEN = 512
-)
-model.eval()
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
+    model = AutoModelForCausalLM.from_pretrained(
-tokenizer.pad_token = tokenizer.eos_token
+        MODEL_ID, device_map="auto", torch_dtype="auto",
-```
+    )
+    model.eval()
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
+    tokenizer.pad_token = tokenizer.eos_token
+    ```
 ### 2. Prepare the Calibration Dataloader
@@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
 to load calibration data. For more details about how to use calibration datasets efficiently, please refer
 to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
-```python
+??? Code
-from datasets import load_dataset
-from torch.utils.data import DataLoader
-BATCH_SIZE = 1
+    ```python
-NUM_CALIBRATION_DATA = 512
+    from datasets import load_dataset
+    from torch.utils.data import DataLoader
-# Load the dataset and get calibration data.
+    BATCH_SIZE = 1
-dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
+    NUM_CALIBRATION_DATA = 512
-text_data = dataset["text"][:NUM_CALIBRATION_DATA]
-tokenized_outputs = tokenizer(text_data, return_tensors="pt",
+    # Load the dataset and get calibration data.
-    padding=True, truncation=True, max_length=MAX_SEQ_LEN)
+    dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
-calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
+    text_data = dataset["text"][:NUM_CALIBRATION_DATA]
-    batch_size=BATCH_SIZE, drop_last=True)
-```
+    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
+        padding=True, truncation=True, max_length=MAX_SEQ_LEN)
+    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
+        batch_size=BATCH_SIZE, drop_last=True)
+    ```
 ### 3. Set the Quantization Configuration
@@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
    AutoSmoothQuant config file for Llama is
    `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
-```python
+??? Code
-from quark.torch.quantization import (Config, QuantizationConfig,
-                                     FP8E4M3PerTensorSpec,
+    ```python
-                                     load_quant_algo_config_from_file)
+    from quark.torch.quantization import (Config, QuantizationConfig,
+                                        FP8E4M3PerTensorSpec,
-# Define fp8/per-tensor/static spec.
+                                        load_quant_algo_config_from_file)
-FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-    is_dynamic=False).to_quantization_spec()
+    # Define fp8/per-tensor/static spec.
+    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
+        is_dynamic=False).to_quantization_spec()
-global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-    weight=FP8_PER_TENSOR_SPEC)
+    # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
+    global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
+        weight=FP8_PER_TENSOR_SPEC)
-KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
-kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
+    # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
-kv_cache_quant_config = {name :
+    KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
-    QuantizationConfig(input_tensors=global_quant_config.input_tensors,
+    kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
-                       weight=global_quant_config.weight,
+    kv_cache_quant_config = {name :
-                       output_tensors=KV_CACHE_SPEC)
+        QuantizationConfig(input_tensors=global_quant_config.input_tensors,
-    for name in kv_cache_layer_names_for_llama}
+                        weight=global_quant_config.weight,
-layer_quant_config = kv_cache_quant_config.copy()
+                        output_tensors=KV_CACHE_SPEC)
+        for name in kv_cache_layer_names_for_llama}
-# Define algorithm config by config file.
+    layer_quant_config = kv_cache_quant_config.copy()
-LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
-    'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
+    # Define algorithm config by config file.
-algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
+    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
+        'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
-EXCLUDE_LAYERS = ["lm_head"]
+    algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
-quant_config = Config(
-    global_quant_config=global_quant_config,
+    EXCLUDE_LAYERS = ["lm_head"]
-    layer_quant_config=layer_quant_config,
+    quant_config = Config(
-    kv_cache_quant_config=kv_cache_quant_config,
+        global_quant_config=global_quant_config,
-    exclude=EXCLUDE_LAYERS,
+        layer_quant_config=layer_quant_config,
-    algo_config=algo_config)
+        kv_cache_quant_config=kv_cache_quant_config,
-```
+        exclude=EXCLUDE_LAYERS,
+        algo_config=algo_config)
+    ```
 ### 4. Quantize the Model and Export
@@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to
 [HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
 for more exporting format details.
-```python
+??? Code
-import torch
-from quark.torch import ModelQuantizer, ModelExporter
+    ```python
-from quark.torch.export import ExporterConfig, JsonExporterConfig
+    import torch
+    from quark.torch import ModelQuantizer, ModelExporter
-# Apply quantization.
+    from quark.torch.export import ExporterConfig, JsonExporterConfig
-quantizer = ModelQuantizer(quant_config)
-quant_model = quantizer.quantize_model(model, calib_dataloader)
+    # Apply quantization.
+    quantizer = ModelQuantizer(quant_config)
-# Freeze quantized model to export.
+    quant_model = quantizer.quantize_model(model, calib_dataloader)
-freezed_model = quantizer.freeze(model)
+    # Freeze quantized model to export.
-# Define export config.
+    freezed_model = quantizer.freeze(model)
-LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
-export_config = ExporterConfig(json_export_config=JsonExporterConfig())
+    # Define export config.
-export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
+    LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
+    export_config = ExporterConfig(json_export_config=JsonExporterConfig())
-# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
+    export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
-EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
-exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
+    # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
-with torch.no_grad():
+    EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
-    exporter.export_safetensors_model(freezed_model,
+    exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
-        quant_config=quant_config, tokenizer=tokenizer)
+    with torch.no_grad():
-```
+        exporter.export_safetensors_model(freezed_model,
+            quant_config=quant_config, tokenizer=tokenizer)
+    ```
 ### 5. Evaluation in vLLM
 Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
-```python
+??? Code
-from vllm import LLM, SamplingParams
+    ```python
-# Sample prompts.
+    from vllm import LLM, SamplingParams
-prompts = [
-    "Hello, my name is",
+    # Sample prompts.
-    "The president of the United States is",
+    prompts = [
-    "The capital of France is",
+        "Hello, my name is",
-    "The future of AI is",
+        "The president of the United States is",
-]
+        "The capital of France is",
-# Create a sampling params object.
+        "The future of AI is",
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ]
+    # Create a sampling params object.
-# Create an LLM.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-          kv_cache_dtype='fp8',quantization='quark')
+    # Create an LLM.
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+    llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-# that contain the prompt, generated text, and other information.
+            kv_cache_dtype='fp8',quantization='quark')
-outputs = llm.generate(prompts, sampling_params)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
-# Print the outputs.
+    # that contain the prompt, generated text, and other information.
-print("\nGenerated Outputs:\n" + "-" * 60)
+    outputs = llm.generate(prompts, sampling_params)
-for output in outputs:
+    # Print the outputs.
-    prompt = output.prompt
+    print("\nGenerated Outputs:\n" + "-" * 60)
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Prompt:    {prompt!r}")
+        prompt = output.prompt
-    print(f"Output:    {generated_text!r}")
+        generated_text = output.outputs[0].text
-    print("-" * 60)
+        print(f"Prompt:    {prompt!r}")
-```
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+    ```
 Or, you can use `lm_eval` to evaluate accuracy:

--- a/docs/features/quantization/torchao.md
+++ b/docs/features/quantization/torchao.md
@@ -15,26 +15,28 @@ pip install \
 ## Quantizing HuggingFace Models
 You can quantize your own Hugging Face model with torchao, e.g. with [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to the Hugging Face Hub, like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo), with the following example code:
-```Python
+??? Code
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+    ```Python
-from torchao.quantization import Int8WeightOnlyConfig
+    import torch
+    from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-model_name = "meta-llama/Meta-Llama-3-8B"
+    from torchao.quantization import Int8WeightOnlyConfig
-quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
-quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_name = "meta-llama/Meta-Llama-3-8B"
-    model_name,
+    quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
-    torch_dtype="auto",
+    quantized_model = AutoModelForCausalLM.from_pretrained(
-    device_map="auto",
+        model_name,
-    quantization_config=quantization_config
+        torch_dtype="auto",
-)
+        device_map="auto",
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+        quantization_config=quantization_config
-input_text = "What are we having for dinner?"
+    )
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    input_text = "What are we having for dinner?"
-hub_repo = # YOUR HUB REPO ID
+    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-tokenizer.push_to_hub(hub_repo)
-quantized_model.push_to_hub(hub_repo, safe_serialization=False)
+    hub_repo = # YOUR HUB REPO ID
-```
+    tokenizer.push_to_hub(hub_repo)
+    quantized_model.push_to_hub(hub_repo, safe_serialization=False)
+    ```
 Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -33,34 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
 Next, make a request to the model that should return the reasoning content in the response.
-```python
+??? Code
-from openai import OpenAI
-# Modify OpenAI's API key and API base to use vLLM's API server.
+    ```python
-openai_api_key = "EMPTY"
+    from openai import OpenAI
-openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
+    # Modify OpenAI's API key and API base to use vLLM's API server.
-    api_key=openai_api_key,
+    openai_api_key = "EMPTY"
-    base_url=openai_api_base,
+    openai_api_base = "http://localhost:8000/v1"
-)
-models = client.models.list()
+    client = OpenAI(
-model = models.data[0].id
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
-# Round 1
+    models = client.models.list()
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+    model = models.data[0].id
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
-# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
-response = client.chat.completions.create(model=model, messages=messages)
-reasoning_content = response.choices[0].message.reasoning_content
+    # Round 1
-content = response.choices[0].message.content
+    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
+    # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
+    # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
+    response = client.chat.completions.create(model=model, messages=messages)
-print("reasoning_content:", reasoning_content)
+    reasoning_content = response.choices[0].message.reasoning_content
-print("content:", content)
+    content = response.choices[0].message.content
-```
+    print("reasoning_content:", reasoning_content)
+    print("content:", content)
+    ```
 The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
@@ -68,77 +70,81 @@ The `reasoning_content` field contains the reasoning steps that led to the final
 Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
-```json
+??? Json
-{
-    "id": "chatcmpl-123",
+    ```json
-    "object": "chat.completion.chunk",
+    {
-    "created": 1694268190,
+        "id": "chatcmpl-123",
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+        "object": "chat.completion.chunk",
-    "system_fingerprint": "fp_44709d6fcb",
+        "created": 1694268190,
-    "choices": [
+        "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
-        {
+        "system_fingerprint": "fp_44709d6fcb",
-            "index": 0,
+        "choices": [
-            "delta": {
+            {
-                "role": "assistant",
+                "index": 0,
-                "reasoning_content": "is",
+                "delta": {
-            },
+                    "role": "assistant",
-            "logprobs": null,
+                    "reasoning_content": "is",
-            "finish_reason": null
+                },
-        }
+                "logprobs": null,
-    ]
+                "finish_reason": null
-}
+            }
-```
+        ]
+    }
+    ```
 The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does support extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:
-```python
+??? Code
-from openai import OpenAI
+    ```python
-# Modify OpenAI's API key and API base to use vLLM's API server.
+    from openai import OpenAI
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
+    # Modify OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
-client = OpenAI(
+    openai_api_base = "http://localhost:8000/v1"
-    api_key=openai_api_key,
-    base_url=openai_api_base,
+    client = OpenAI(
-)
+        api_key=openai_api_key,
+        base_url=openai_api_base,
-models = client.models.list()
+    )
-model = models.data[0].id
+    models = client.models.list()
-messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
+    model = models.data[0].id
-# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
+    messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
-# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
+    # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-stream = client.chat.completions.create(model=model,
+    # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
-                                        messages=messages,
+    # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
-                                        stream=True)
+    stream = client.chat.completions.create(model=model,
+                                            messages=messages,
-print("client: Start streaming chat completions...")
+                                            stream=True)
-printed_reasoning_content = False
-printed_content = False
+    print("client: Start streaming chat completions...")
+    printed_reasoning_content = False
-for chunk in stream:
+    printed_content = False
-    reasoning_content = None
-    content = None
+    for chunk in stream:
-    # Check the content is reasoning_content or content
+        reasoning_content = None
-    if hasattr(chunk.choices[0].delta, "reasoning_content"):
+        content = None
-        reasoning_content = chunk.choices[0].delta.reasoning_content
+        # Check the content is reasoning_content or content
-    elif hasattr(chunk.choices[0].delta, "content"):
+        if hasattr(chunk.choices[0].delta, "reasoning_content"):
-        content = chunk.choices[0].delta.content
+            reasoning_content = chunk.choices[0].delta.reasoning_content
+        elif hasattr(chunk.choices[0].delta, "content"):
-    if reasoning_content is not None:
+            content = chunk.choices[0].delta.content
-        if not printed_reasoning_content:
-            printed_reasoning_content = True
+        if reasoning_content is not None:
-            print("reasoning_content:", end="", flush=True)
+            if not printed_reasoning_content:
-        print(reasoning_content, end="", flush=True)
+                printed_reasoning_content = True
-    elif content is not None:
+                print("reasoning_content:", end="", flush=True)
-        if not printed_content:
+            print(reasoning_content, end="", flush=True)
-            printed_content = True
+        elif content is not None:
-            print("\ncontent:", end="", flush=True)
+            if not printed_content:
-        # Extract and print the content
+                printed_content = True
-        print(content, end="", flush=True)
+                print("\ncontent:", end="", flush=True)
-```
+            # Extract and print the content
+            print(content, end="", flush=True)
+    ```
 Remember to check whether the `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
@@ -146,41 +152,43 @@ Remember to check whether the `reasoning_content` exists in the response before
 The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
-```python
+??? Code
-from openai import OpenAI
+    ```python
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
+    from openai import OpenAI
-tools = [{
+    client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
-    "type": "function",
-    "function": {
+    tools = [{
-        "name": "get_weather",
+        "type": "function",
-        "description": "Get the current weather in a given location",
+        "function": {
-        "parameters": {
+            "name": "get_weather",
-            "type": "object",
+            "description": "Get the current weather in a given location",
-            "properties": {
+            "parameters": {
-                "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
+                "type": "object",
-                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+                "properties": {
-            },
+                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
-            "required": ["location", "unit"]
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+                },
+                "required": ["location", "unit"]
+            }
        }
-    }
+    }]
-}]
-response = client.chat.completions.create(
+    response = client.chat.completions.create(
-    model=client.models.list().data[0].id,
+        model=client.models.list().data[0].id,
-    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
+        messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
-    tools=tools,
+        tools=tools,
-    tool_choice="auto"
+        tool_choice="auto"
-)
+    )
-print(response)
+    print(response)
-tool_call = response.choices[0].message.tool_calls[0].function
+    tool_call = response.choices[0].message.tool_calls[0].function
-print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
+    print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
-print(f"Function called: {tool_call.name}")
+    print(f"Function called: {tool_call.name}")
-print(f"Arguments: {tool_call.arguments}")
+    print(f"Arguments: {tool_call.arguments}")
-```
+    ```
 For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
@@ -192,85 +200,89 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
 You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
-```python
+??? Code
-# import the required packages
+    ```python
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
+    # import the required packages
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              DeltaMessage)
+    from vllm.reasoning import ReasoningParser, ReasoningParserManager
+    from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-# define a reasoning parser and register it to vllm
+                                                DeltaMessage)
-# the name list in register_module can be used
-# in --reasoning-parser.
+    # define a reasoning parser and register it to vllm
-@ReasoningParserManager.register_module(["example"])
+    # the name list in register_module can be used
-class ExampleParser(ReasoningParser):
+    # in --reasoning-parser.
-    def __init__(self, tokenizer: AnyTokenizer):
+    @ReasoningParserManager.register_module(["example"])
-        super().__init__(tokenizer)
+    class ExampleParser(ReasoningParser):
+        def __init__(self, tokenizer: AnyTokenizer):
-    def extract_reasoning_content_streaming(
+            super().__init__(tokenizer)
-        self,
-        previous_text: str,
+        def extract_reasoning_content_streaming(
-        current_text: str,
+            self,
-        delta_text: str,
+            previous_text: str,
-        previous_token_ids: Sequence[int],
+            current_text: str,
-        current_token_ids: Sequence[int],
+            delta_text: str,
-        delta_token_ids: Sequence[int],
+            previous_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+            current_token_ids: Sequence[int],
-        """
+            delta_token_ids: Sequence[int],
-        Instance method that should be implemented for extracting reasoning
+        ) -> Union[DeltaMessage, None]:
-        from an incomplete response; for use when handling reasoning calls and
+            """
-        streaming. Has to be an instance method because  it requires state -
+            Instance method that should be implemented for extracting reasoning
-        the current tokens/diffs, but also the information about what has
+            from an incomplete response; for use when handling reasoning calls and
-        previously been parsed and extracted (see constructor)
+            streaming. Has to be an instance method because  it requires state -
-        """
+            the current tokens/diffs, but also the information about what has
+            previously been parsed and extracted (see constructor)
-    def extract_reasoning_content(
+            """
-            self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+        def extract_reasoning_content(
-        """
+                self, model_output: str, request: ChatCompletionRequest
-        Extract reasoning content from a complete model-generated string.
+        ) -> tuple[Optional[str], Optional[str]]:
+            """
-        Used for non-streaming responses where we have the entire model response
+            Extract reasoning content from a complete model-generated string.
-        available before sending to the client.
+            Used for non-streaming responses where we have the entire model response
+            available before sending to the client.
+            Parameters:
+            model_output: str
+                The model-generated string to extract reasoning content from.
+            request: ChatCompletionRequest
+                The request object that was used to generate the model_output.
+            Returns:
+            tuple[Optional[str], Optional[str]]
+                A tuple containing the reasoning content and the content.
+            """
+    ```
-        Parameters:
+Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
-        model_output: str
-            The model-generated string to extract reasoning content from.
-        request: ChatCompletionRequest
+??? Code
-            The request object that was used to generate the model_output.
-        Returns:
+    ```python
-        tuple[Optional[str], Optional[str]]
+    @dataclass
-            A tuple containing the reasoning content and the content.
+    class DeepSeekReasoner(Reasoner):
        """
-```
+        Reasoner for DeepSeek R series models.
+        """
-Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
+        start_token_id: int
+        end_token_id: int
-```python
-@dataclass
+        start_token: str = "<think>"
-class DeepSeekReasoner(Reasoner):
+        end_token: str = "</think>"
-    """
-    Reasoner for DeepSeek R series models.
+        @classmethod
-    """
+        def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
-    start_token_id: int
+            return cls(start_token_id=tokenizer.encode(
-    end_token_id: int
+                "<think>", add_special_tokens=False)[0],
+                    end_token_id=tokenizer.encode("</think>",
-    start_token: str = "<think>"
+                                                    add_special_tokens=False)[0])
-    end_token: str = "</think>"
+        def is_reasoning_end(self, input_ids: list[int]) -> bool:
-    @classmethod
+            return self.end_token_id in input_ids
-    def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
+        ...
-        return cls(start_token_id=tokenizer.encode(
+    ```
-            "<think>", add_special_tokens=False)[0],
-                   end_token_id=tokenizer.encode("</think>",
-                                                 add_special_tokens=False)[0])
-    def is_reasoning_end(self, input_ids: list[int]) -> bool:
-        return self.end_token_id in input_ids
-    ...
-```
 A structured output engine such as [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check whether the reasoning content is present in the model output and, if so, skip the structured output.

--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory
 The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
-```python
+??? Code
-from vllm import LLM, SamplingParams
+    ```python
-prompts = [
+    from vllm import LLM, SamplingParams
-    "The future of AI is",
-]
+    prompts = [
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+        "The future of AI is",
+    ]
-llm = LLM(
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    model="facebook/opt-6.7b",
-    tensor_parallel_size=1,
+    llm = LLM(
-    speculative_config={
+        model="facebook/opt-6.7b",
-        "model": "facebook/opt-125m",
+        tensor_parallel_size=1,
-        "num_speculative_tokens": 5,
+        speculative_config={
-    },
+            "model": "facebook/opt-125m",
-)
+            "num_speculative_tokens": 5,
-outputs = llm.generate(prompts, sampling_params)
+        },
+    )
-for output in outputs:
+    outputs = llm.generate(prompts, sampling_params)
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        prompt = output.prompt
-```
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
 To perform the same in online mode, launch the server:
@@ -60,69 +62,73 @@ python -m vllm.entrypoints.openai.api_server \
 Then use a client:
-```python
+??? Code
-from openai import OpenAI
+    ```python
-# Modify OpenAI's API key and API base to use vLLM's API server.
+    from openai import OpenAI
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
+    # Modify OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
-client = OpenAI(
+    openai_api_base = "http://localhost:8000/v1"
-    # defaults to os.environ.get("OPENAI_API_KEY")
-    api_key=openai_api_key,
+    client = OpenAI(
-    base_url=openai_api_base,
+        # defaults to os.environ.get("OPENAI_API_KEY")
-)
+        api_key=openai_api_key,
+        base_url=openai_api_base,
-models = client.models.list()
+    )
-model = models.data[0].id
+    models = client.models.list()
-# Completion API
+    model = models.data[0].id
-stream = False
-completion = client.completions.create(
+    # Completion API
-    model=model,
+    stream = False
-    prompt="The future of AI is",
+    completion = client.completions.create(
-    echo=False,
+        model=model,
-    n=1,
+        prompt="The future of AI is",
-    stream=stream,
+        echo=False,
-)
+        n=1,
+        stream=stream,
-print("Completion results:")
+    )
-if stream:
-    for c in completion:
+    print("Completion results:")
-        print(c)
+    if stream:
-else:
+        for c in completion:
-    print(completion)
+            print(c)
-```
+    else:
+        print(completion)
+    ```
 ## Speculating by matching n-grams in the prompt
 The following code configures vLLM to use speculative decoding where proposals are generated by
 matching n-grams in the prompt. For more information, read [this thread](https://x.com/joao_gante/status/1747322413006643259).
-```python
+??? Code
-from vllm import LLM, SamplingParams
+    ```python
-prompts = [
+    from vllm import LLM, SamplingParams
-    "The future of AI is",
-]
+    prompts = [
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+        "The future of AI is",
+    ]
-llm = LLM(
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    model="facebook/opt-6.7b",
-    tensor_parallel_size=1,
+    llm = LLM(
-    speculative_config={
+        model="facebook/opt-6.7b",
-        "method": "ngram",
+        tensor_parallel_size=1,
-        "num_speculative_tokens": 5,
+        speculative_config={
-        "prompt_lookup_max": 4,
+            "method": "ngram",
-    },
+            "num_speculative_tokens": 5,
-)
+            "prompt_lookup_max": 4,
-outputs = llm.generate(prompts, sampling_params)
+        },
+    )
-for output in outputs:
+    outputs = llm.generate(prompts, sampling_params)
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        prompt = output.prompt
-```
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
 ## Speculating using MLP speculators
@@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam
 For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
 [this technical report](https://arxiv.org/abs/2404.19124).
-```python
+??? Code
-from vllm import LLM, SamplingParams
+    ```python
-prompts = [
+    from vllm import LLM, SamplingParams
-    "The future of AI is",
-]
+    prompts = [
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+        "The future of AI is",
+    ]
-llm = LLM(
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
-    tensor_parallel_size=4,
+    llm = LLM(
-    speculative_config={
+        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
-        "model": "ibm-ai-platform/llama3-70b-accelerator",
+        tensor_parallel_size=4,
-        "draft_tensor_parallel_size": 1,
+        speculative_config={
-    },
+            "model": "ibm-ai-platform/llama3-70b-accelerator",
-)
+            "draft_tensor_parallel_size": 1,
-outputs = llm.generate(prompts, sampling_params)
+        },
+    )
-for output in outputs:
+    outputs = llm.generate(prompts, sampling_params)
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        prompt = output.prompt
-```
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
 Note that these speculative models currently need to be run without tensor parallelism, although
 it is possible to run the main model using tensor parallelism (see example above). Since the
@@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub:
 The following code configures vLLM to use speculative decoding where proposals are generated by
 an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
-```python
+??? Code
-from vllm import LLM, SamplingParams
-prompts = [
+    ```python
-    "The future of AI is",
+    from vllm import LLM, SamplingParams
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(
+    prompts = [
-    model="meta-llama/Meta-Llama-3-8B-Instruct",
+        "The future of AI is",
-    tensor_parallel_size=4,
+    ]
-    speculative_config={
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
-        "draft_tensor_parallel_size": 1,
-    },
-)
-outputs = llm.generate(prompts, sampling_params)
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3-8B-Instruct",
+        tensor_parallel_size=4,
+        speculative_config={
+            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+            "draft_tensor_parallel_size": 1,
+        },
+    )
-for output in outputs:
+    outputs = llm.generate(prompts, sampling_params)
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
 A few important things to consider when using the EAGLE based draft models:

--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -33,39 +33,43 @@ text.
 Now let's see an example for each of the cases, starting with `guided_choice`, as it's the easiest one:
-```python
+??? Code
-from openai import OpenAI
-client = OpenAI(
+    ```python
-    base_url="http://localhost:8000/v1",
+    from openai import OpenAI
-    api_key="-",
+    client = OpenAI(
-)
+        base_url="http://localhost:8000/v1",
-model = client.models.list().data[0].id
+        api_key="-",
+    )
-completion = client.chat.completions.create(
+    model = client.models.list().data[0].id
-    model=model,
-    messages=[
+    completion = client.chat.completions.create(
-        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        model=model,
-    ],
+        messages=[
-    extra_body={"guided_choice": ["positive", "negative"]},
+            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
-)
+        ],
-print(completion.choices[0].message.content)
+        extra_body={"guided_choice": ["positive", "negative"]},
-```
+    )
+    print(completion.choices[0].message.content)
+    ```
 The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
-```python
+??? Code
-completion = client.chat.completions.create(
-    model=model,
+    ```python
-    messages=[
+    completion = client.chat.completions.create(
-        {
+        model=model,
-            "role": "user",
+        messages=[
-            "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
+            {
-        }
+                "role": "user",
-    ],
+                "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
-    extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
+            }
-)
+        ],
-print(completion.choices[0].message.content)
+        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
-```
+    )
+    print(completion.choices[0].message.content)
+    ```
 One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
 For this we can use the `guided_json` parameter in two different ways:
@@ -75,41 +79,43 @@ For this we can use the `guided_json` parameter in two different ways:
 The next example shows how to use the `guided_json` parameter with a Pydantic model:
-```python
+??? Code
-from pydantic import BaseModel
-from enum import Enum
+    ```python
+    from pydantic import BaseModel
-class CarType(str, Enum):
+    from enum import Enum
-    sedan = "sedan"
-    suv = "SUV"
+    class CarType(str, Enum):
-    truck = "Truck"
+        sedan = "sedan"
-    coupe = "Coupe"
+        suv = "SUV"
+        truck = "Truck"
-class CarDescription(BaseModel):
+        coupe = "Coupe"
-    brand: str
-    model: str
+    class CarDescription(BaseModel):
-    car_type: CarType
+        brand: str
+        model: str
-json_schema = CarDescription.model_json_schema()
+        car_type: CarType
-completion = client.chat.completions.create(
+    json_schema = CarDescription.model_json_schema()
-    model=model,
-    messages=[
+    completion = client.chat.completions.create(
-        {
+        model=model,
-            "role": "user",
+        messages=[
-            "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
+            {
-        }
+                "role": "user",
-    ],
+                "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
-    "response_format": {
+            }
-        "type": "json_schema",
+        ],
-        "json_schema": {
+        "response_format": {
-            "name": "car-description",
+            "type": "json_schema",
-            "schema": CarDescription.model_json_schema()
+            "json_schema": {
+                "name": "car-description",
+                "schema": CarDescription.model_json_schema()
+            },
        },
-    },
+    )
-)
+    print(completion.choices[0].message.content)
-print(completion.choices[0].message.content)
+    ```
-```
 !!! tip
    While not strictly necessary, normally it's better to indicate in the prompt the
@@ -121,33 +127,35 @@ difficult to use, but it´s really powerful. It allows us to define complete
 languages like SQL queries. It works by using a context free EBNF grammar.
 As an example, we can use it to define a specific format of simplified SQL queries:
-```python
+??? Code
-simplified_sql_grammar = """
-    root ::= select_statement
-    select_statement ::= "SELECT " column " from " table " where " condition
+    ```python
+    simplified_sql_grammar = """
+        root ::= select_statement
-    column ::= "col_1 " | "col_2 "
+        select_statement ::= "SELECT " column " from " table " where " condition
-    table ::= "table_1 " | "table_2 "
+        column ::= "col_1 " | "col_2 "
-    condition ::= column "= " number
+        table ::= "table_1 " | "table_2 "
-    number ::= "1 " | "2 "
+        condition ::= column "= " number
-"""
-completion = client.chat.completions.create(
+        number ::= "1 " | "2 "
-    model=model,
+    """
-    messages=[
-        {
+    completion = client.chat.completions.create(
-            "role": "user",
+        model=model,
-            "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
+        messages=[
-        }
+            {
-    ],
+                "role": "user",
-    extra_body={"guided_grammar": simplified_sql_grammar},
+                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
-)
+            }
-print(completion.choices[0].message.content)
+        ],
-```
+        extra_body={"guided_grammar": simplified_sql_grammar},
+    )
+    print(completion.choices[0].message.content)
+    ```
 See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@@ -161,34 +169,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r
 Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
-```python
+??? Code
-from pydantic import BaseModel
+    ```python
+    from pydantic import BaseModel
-class People(BaseModel):
-    name: str
-    age: int
+    class People(BaseModel):
+        name: str
+        age: int
-completion = client.chat.completions.create(
-    model=model,
-    messages=[
+    completion = client.chat.completions.create(
-        {
+        model=model,
-            "role": "user",
+        messages=[
-            "content": "Generate a JSON with the name and age of one random person.",
+            {
-        }
+                "role": "user",
-    ],
+                "content": "Generate a JSON with the name and age of one random person.",
-    response_format={
+            }
-        "type": "json_schema",
+        ],
-        "json_schema": {
+        response_format={
-            "name": "people",
+            "type": "json_schema",
-            "schema": People.model_json_schema()
+            "json_schema": {
-        }
+                "name": "people",
-    },
+                "schema": People.model_json_schema()
-)
+            }
-print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+        },
-print("content: ", completion.choices[0].message.content)
+    )
-```
+    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
+    print("content: ", completion.choices[0].message.content)
+    ```
 See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
@@ -202,33 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.
 Here is a simple example demonstrating how to get structured output using Pydantic models:
-```python
+??? Code
-from pydantic import BaseModel
-from openai import OpenAI
+    ```python
+    from pydantic import BaseModel
-class Info(BaseModel):
+    from openai import OpenAI
-    name: str
-    age: int
+    class Info(BaseModel):
+        name: str
-client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
+        age: int
-model = client.models.list().data[0].id
-completion = client.beta.chat.completions.parse(
+    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
-    model=model,
+    model = client.models.list().data[0].id
-    messages=[
+    completion = client.beta.chat.completions.parse(
-        {"role": "system", "content": "You are a helpful assistant."},
+        model=model,
-        {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
+        messages=[
-    ],
+            {"role": "system", "content": "You are a helpful assistant."},
-    response_format=Info,
+            {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
-)
+        ],
+        response_format=Info,
-message = completion.choices[0].message
+    )
-print(message)
-assert message.parsed
+    message = completion.choices[0].message
-print("Name:", message.parsed.name)
+    print(message)
-print("Age:", message.parsed.age)
+    assert message.parsed
-```
+    print("Name:", message.parsed.name)
+    print("Age:", message.parsed.age)
-Output:
+    ```
 ```console
 ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
@@ -238,35 +248,37 @@ Age: 28
 Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
-```python
+??? Code
-from typing import List
-from pydantic import BaseModel
+    ```python
-from openai import OpenAI
+    from typing import List
+    from pydantic import BaseModel
-class Step(BaseModel):
+    from openai import OpenAI
-    explanation: str
-    output: str
+    class Step(BaseModel):
+        explanation: str
-class MathResponse(BaseModel):
+        output: str
-    steps: list[Step]
-    final_answer: str
+    class MathResponse(BaseModel):
+        steps: list[Step]
-completion = client.beta.chat.completions.parse(
+        final_answer: str
-    model=model,
-    messages=[
+    completion = client.beta.chat.completions.parse(
-        {"role": "system", "content": "You are a helpful expert math tutor."},
+        model=model,
-        {"role": "user", "content": "Solve 8x + 31 = 2."},
+        messages=[
-    ],
+            {"role": "system", "content": "You are a helpful expert math tutor."},
-    response_format=MathResponse,
+            {"role": "user", "content": "Solve 8x + 31 = 2."},
-)
+        ],
+        response_format=MathResponse,
-message = completion.choices[0].message
+    )
-print(message)
-assert message.parsed
+    message = completion.choices[0].message
-for i, step in enumerate(message.parsed.steps):
+    print(message)
-    print(f"Step #{i}:", step)
+    assert message.parsed
-print("Answer:", message.parsed.final_answer)
+    for i, step in enumerate(message.parsed.steps):
-```
+        print(f"Step #{i}:", step)
+    print("Answer:", message.parsed.final_answer)
+    ```
 Output:
@@ -296,19 +308,21 @@ These parameters can be used in the same way as the parameters from the Online
 Serving examples above. One example for the usage of the `choice` parameter is
 shown below:
-```python
+??? Code
-from vllm import LLM, SamplingParams
-from vllm.sampling_params import GuidedDecodingParams
-llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
+    ```python
+    from vllm import LLM, SamplingParams
+    from vllm.sampling_params import GuidedDecodingParams
-guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
+    llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
-sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
-outputs = llm.generate(
+    guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
-    prompts="Classify this sentiment: vLLM is wonderful!",
+    sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
-    sampling_params=sampling_params,
+    outputs = llm.generate(
-)
+        prompts="Classify this sentiment: vLLM is wonderful!",
-print(outputs[0].outputs[0].text)
+        sampling_params=sampling_params,
-```
+    )
+    print(outputs[0].outputs[0].text)
+    ```
 See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -15,44 +15,46 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \
 Next, make a request to the model that should result in it using the available tools:
-```python
+??? Code
-from openai import OpenAI
-import json
+    ```python
+    from openai import OpenAI
-client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
+    import json
-def get_weather(location: str, unit: str):
+    client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
-    return f"Getting the weather for {location} in {unit}..."
-tool_functions = {"get_weather": get_weather}
+    def get_weather(location: str, unit: str):
+        return f"Getting the weather for {location} in {unit}..."
-tools = [{
+    tool_functions = {"get_weather": get_weather}
-    "type": "function",
-    "function": {
+    tools = [{
-        "name": "get_weather",
+        "type": "function",
-        "description": "Get the current weather in a given location",
+        "function": {
-        "parameters": {
+            "name": "get_weather",
-            "type": "object",
+            "description": "Get the current weather in a given location",
-            "properties": {
+            "parameters": {
-                "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
+                "type": "object",
-                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+                "properties": {
-            },
+                    "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
-            "required": ["location", "unit"]
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
+                },
+                "required": ["location", "unit"]
+            }
        }
-    }
+    }]
-}]
+    response = client.chat.completions.create(
-response = client.chat.completions.create(
+        model=client.models.list().data[0].id,
-    model=client.models.list().data[0].id,
+        messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
-    messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
+        tools=tools,
-    tools=tools,
+        tool_choice="auto"
-    tool_choice="auto"
+    )
-)
+    tool_call = response.choices[0].message.tool_calls[0].function
-tool_call = response.choices[0].message.tool_calls[0].function
+    print(f"Function called: {tool_call.name}")
-print(f"Function called: {tool_call.name}")
+    print(f"Arguments: {tool_call.arguments}")
-print(f"Arguments: {tool_call.arguments}")
+    print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
-print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
+    ```
-```
 Example output:
@@ -301,49 +303,51 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen
 Here is a summary of a plugin file:
-```python
+??? Code
-# import the required packages
+    ```python
-# define a tool parser and register it to vllm
+    # import the required packages
-# the name list in register_module can be used
-# in --tool-call-parser. you can define as many
+    # define a tool parser and register it to vllm
-# tool parsers as you want here.
+    # the name list in register_module can be used
-@ToolParserManager.register_module(["example"])
+    # in --tool-call-parser. you can define as many
-class ExampleToolParser(ToolParser):
+    # tool parsers as you want here.
-    def __init__(self, tokenizer: AnyTokenizer):
+    @ToolParserManager.register_module(["example"])
-        super().__init__(tokenizer)
+    class ExampleToolParser(ToolParser):
+        def __init__(self, tokenizer: AnyTokenizer):
-    # adjust request. e.g.: set skip special tokens
+            super().__init__(tokenizer)
-    # to False for tool call output.
-    def adjust_request(
+        # adjust request. e.g.: set skip special tokens
-            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        # to False for tool call output.
-        return request
+        def adjust_request(
+                self, request: ChatCompletionRequest) -> ChatCompletionRequest:
-    # implement the tool call parse for stream call
+            return request
-    def extract_tool_calls_streaming(
-        self,
+        # implement the tool call parse for stream call
-        previous_text: str,
+        def extract_tool_calls_streaming(
-        current_text: str,
+            self,
-        delta_text: str,
+            previous_text: str,
-        previous_token_ids: Sequence[int],
+            current_text: str,
-        current_token_ids: Sequence[int],
+            delta_text: str,
-        delta_token_ids: Sequence[int],
+            previous_token_ids: Sequence[int],
-        request: ChatCompletionRequest,
+            current_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+            delta_token_ids: Sequence[int],
-        return delta
+            request: ChatCompletionRequest,
+        ) -> Union[DeltaMessage, None]:
-    # implement the tool parse for non-stream call
+            return delta
-    def extract_tool_calls(
-        self,
+        # implement the tool parse for non-stream call
-        model_output: str,
+        def extract_tool_calls(
-        request: ChatCompletionRequest,
+            self,
-    ) -> ExtractedToolCallInformation:
+            model_output: str,
-        return ExtractedToolCallInformation(tools_called=False,
+            request: ChatCompletionRequest,
-                                            tool_calls=[],
+        ) -> ExtractedToolCallInformation:
-                                            content=text)
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
-```
+                                                content=text)
+    ```
 Then you can use this plugin in the command line like this.

--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -76,21 +76,23 @@ Currently, there are no pre-built CPU wheels.
 ### Build image from source
-```console
+??? Commands
-$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
+    ```console
-# Launching OpenAI server 
+    $ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
-$ docker run --rm \
-             --privileged=true \
+    # Launching OpenAI server 
-             --shm-size=4g \
+    $ docker run --rm \
-             -p 8000:8000 \
+                --privileged=true \
-             -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
+                --shm-size=4g \
-             -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
+                -p 8000:8000 \
-             vllm-cpu-env \
+                -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
-             --model=meta-llama/Llama-3.2-1B-Instruct \
+                -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
-             --dtype=bfloat16 \
+                vllm-cpu-env \
-             other vLLM OpenAI server arguments
+                --model=meta-llama/Llama-3.2-1B-Instruct \
-```
+                --dtype=bfloat16 \
+                other vLLM OpenAI server arguments
+    ```
 !!! tip
    For ARM or Apple silicon, use `docker/Dockerfile.arm`
@@ -144,32 +146,34 @@ vllm serve facebook/opt-125m
 - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
-```console
+??? Commands
-$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
+    ```console
-# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
+    $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
-CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
-0    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
+    # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
-1    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
+    CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE    MAXMHZ   MINMHZ      MHZ
-2    0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
+    0    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
-3    0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
+    1    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
-4    0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
+    2    0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
-5    0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
+    3    0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
-6    0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
+    4    0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
-7    0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
+    5    0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
-8    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
+    6    0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
-9    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
+    7    0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
-10   0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
+    8    0      0    0 0:0:0:0          yes 2401.0000 800.0000  800.000
-11   0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
+    9    0      0    1 1:1:1:0          yes 2401.0000 800.0000  800.000
-12   0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
+    10   0      0    2 2:2:2:0          yes 2401.0000 800.0000  800.000
-13   0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
+    11   0      0    3 3:3:3:0          yes 2401.0000 800.0000  800.000
-14   0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
+    12   0      0    4 4:4:4:0          yes 2401.0000 800.0000  800.000
-15   0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
+    13   0      0    5 5:5:5:0          yes 2401.0000 800.0000  800.000
+    14   0      0    6 6:6:6:0          yes 2401.0000 800.0000  800.000
-# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
+    15   0      0    7 7:7:7:0          yes 2401.0000 800.0000  800.000
-$ export VLLM_CPU_OMP_THREADS_BIND=0-7
-$ python examples/offline_inference/basic/basic.py
+    # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
-```
+    $ export VLLM_CPU_OMP_THREADS_BIND=0-7
+    $ python examples/offline_inference/basic/basic.py
+    ```
 - If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access.