[doc] Fold long code blocks to improve readability (#19926)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>

[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
f17aec0d · Reid · GitHub · 493c2753 · f17aec0d · f17aec0d
Unverified Commit f17aec0d authored Jun 23, 2025 by Reid Committed by GitHub Jun 23, 2025
20 changed files
--- a/docs/design/kernel/paged_attention.md
+++ b/docs/design/kernel/paged_attention.md
@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
 all results for output have been calculated but are just stored in
 different thread register memory.
-```cpp
+??? Code
-float* out_smem = reinterpret_cast<float*>(shared_mem);
-for (int i = NUM_WARPS; i > 1; i /= 2) {
-    // Upper warps write to shared memory.
-    ...
-    float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
-    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-        ...
-        dst[row_idx] = accs[i];
-    }
-    // Lower warps update the output.
+    ```cpp
-    const float* src = &out_smem[warp_idx * HEAD_SIZE];
+    float* out_smem = reinterpret_cast<float*>(shared_mem);
-    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    for (int i = NUM_WARPS; i > 1; i /= 2) {
+        // Upper warps write to shared memory.
        ...
-        accs[i] += src[row_idx];
+        float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+            ...
+            dst[row_idx] = accs[i];
+        }
+        // Lower warps update the output.
+        const float* src = &out_smem[warp_idx * HEAD_SIZE];
+        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+            ...
+            accs[i] += src[row_idx];
+        }
+        // Write out the accs.
    }
+    ```
-    // Write out the accs.
-}
-```
 ## Output

--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
 vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
-```python
+??? Code
-# inside `setup.py` file
-from setuptools import setup
+    ```python
+    # inside `setup.py` file
-setup(name='vllm_add_dummy_model',
+    from setuptools import setup
-      version='0.1',
-      packages=['vllm_add_dummy_model'],
+    setup(name='vllm_add_dummy_model',
-      entry_points={
+        version='0.1',
-          'vllm.general_plugins':
+        packages=['vllm_add_dummy_model'],
-          ["register_dummy_model = vllm_add_dummy_model:register"]
+        entry_points={
-      })
+            'vllm.general_plugins':
+            ["register_dummy_model = vllm_add_dummy_model:register"]
-# inside `vllm_add_dummy_model.py` file
+        })
-def register():
-    from vllm import ModelRegistry
+    # inside `vllm_add_dummy_model.py` file
+    def register():
-    if "MyLlava" not in ModelRegistry.get_supported_archs():
+        from vllm import ModelRegistry
-        ModelRegistry.register_model(
-            "MyLlava",
+        if "MyLlava" not in ModelRegistry.get_supported_archs():
-            "vllm_add_dummy_model.my_llava:MyLlava",
+            ModelRegistry.register_model(
-        )
+                "MyLlava",
-```
+                "vllm_add_dummy_model.my_llava:MyLlava",
+            )
+    ```
 For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).

--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
 of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
 the third parameter is the path to the LoRA adapter.
-```python
+??? Code
-sampling_params = SamplingParams(
-    temperature=0,
+    ```python
-    max_tokens=256,
+    sampling_params = SamplingParams(
-    stop=["[/assistant]"]
+        temperature=0,
-)
+        max_tokens=256,
+        stop=["[/assistant]"]
-prompts = [
+    )
-     "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
-     "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
+    prompts = [
-]
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
-outputs = llm.generate(
+    ]
-    prompts,
-    sampling_params,
+    outputs = llm.generate(
-    lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
+        prompts,
-)
+        sampling_params,
-```
+        lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
+    )
+    ```
 Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
 etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
 with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
-```bash
+??? Command
-curl localhost:8000/v1/models | jq .
-{
+    ```bash
-    "object": "list",
+    curl localhost:8000/v1/models | jq .
-    "data": [
+    {
-        {
+        "object": "list",
-            "id": "meta-llama/Llama-2-7b-hf",
+        "data": [
-            "object": "model",
+            {
-            ...
+                "id": "meta-llama/Llama-2-7b-hf",
-        },
+                "object": "model",
-        {
+                ...
-            "id": "sql-lora",
+            },
-            "object": "model",
+            {
-            ...
+                "id": "sql-lora",
-        }
+                "object": "model",
-    ]
+                ...
-}
+            }
-```
+        ]
+    }
+    ```
 Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
 processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
 1. Implement the LoRAResolver interface.
-    Example of a simple S3 LoRAResolver implementation:
+    ??? code "Example of a simple S3 LoRAResolver implementation"
-    ```python
+        ```python
-    import os
+        import os
-    import s3fs
+        import s3fs
-    from vllm.lora.request import LoRARequest
+        from vllm.lora.request import LoRARequest
-    from vllm.lora.resolver import LoRAResolver
+        from vllm.lora.resolver import LoRAResolver
-    class S3LoRAResolver(LoRAResolver):
+        class S3LoRAResolver(LoRAResolver):
-        def __init__(self):
+            def __init__(self):
-            self.s3 = s3fs.S3FileSystem()
+                self.s3 = s3fs.S3FileSystem()
-            self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
+                self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
-            self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
+                self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
-        async def resolve_lora(self, base_model_name, lora_name):
+            async def resolve_lora(self, base_model_name, lora_name):
-            s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
+                s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
-            local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
+                local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
-            # Download the LoRA from S3 to the local path
+                # Download the LoRA from S3 to the local path
-            await self.s3._get(
+                await self.s3._get(
-                s3_path, local_path, recursive=True, maxdepth=1
+                    s3_path, local_path, recursive=True, maxdepth=1
-            )
+                )
-            lora_request = LoRARequest(
+                lora_request = LoRARequest(
-                lora_name=lora_name,
+                    lora_name=lora_name,
-                lora_path=local_path,
+                    lora_path=local_path,
-                lora_int_id=abs(hash(lora_name))
+                    lora_int_id=abs(hash(lora_name))
-            )
+                )
-            return lora_request
+                return lora_request
-    ```
+        ```
 2. Register `LoRAResolver` plugin.
@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
 - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
 - The `root` field points to the artifact location of the lora adapter.
-```bash
+??? command "Command output"
-$ curl http://localhost:8000/v1/models
+    ```bash
-{
+    $ curl http://localhost:8000/v1/models
-    "object": "list",
-    "data": [
+    {
-        {
+        "object": "list",
-        "id": "meta-llama/Llama-2-7b-hf",
+        "data": [
-        "object": "model",
-        "created": 1715644056,
-        "owned_by": "vllm",
-        "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
-        "parent": null,
-        "permission": [
            {
-            .....
+            "id": "meta-llama/Llama-2-7b-hf",
-            }
+            "object": "model",
-        ]
+            "created": 1715644056,
-        },
+            "owned_by": "vllm",
-        {
+            "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
-        "id": "sql-lora",
+            "parent": null,
-        "object": "model",
+            "permission": [
-        "created": 1715644056,
+                {
-        "owned_by": "vllm",
+                .....
-        "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
+                }
-        "parent": meta-llama/Llama-2-7b-hf,
+            ]
-        "permission": [
+            },
            {
-            ....
+            "id": "sql-lora",
+            "object": "model",
+            "created": 1715644056,
+            "owned_by": "vllm",
+            "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
+            "parent": "meta-llama/Llama-2-7b-hf",
+            "permission": [
+                {
+                ....
+                }
+            ]
            }
        ]
-        }
+    }
-    ]
+    ```
-}
-```
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
--- a/docs/features/quantization/auto_awq.md
+++ b/docs/features/quantization/auto_awq.md
@@ -15,29 +15,31 @@ pip install autoawq
 After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
-```python
+??? Code
-from awq import AutoAWQForCausalLM
-from transformers import AutoTokenizer
-model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
+    ```python
-quant_path = 'mistral-instruct-v0.2-awq'
+    from awq import AutoAWQForCausalLM
-quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+    from transformers import AutoTokenizer
-# Load model
+    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-model = AutoAWQForCausalLM.from_pretrained(
+    quant_path = 'mistral-instruct-v0.2-awq'
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
-)
-tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-# Quantize
+    # Load model
-model.quantize(tokenizer, quant_config=quant_config)
+    model = AutoAWQForCausalLM.from_pretrained(
+        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-# Save quantized model
+    # Quantize
-model.save_quantized(quant_path)
+    model.quantize(tokenizer, quant_config=quant_config)
-tokenizer.save_pretrained(quant_path)
-print(f'Model is quantized and saved at "{quant_path}"')
+    # Save quantized model
-```
+    model.save_quantized(quant_path)
+    tokenizer.save_pretrained(quant_path)
+    print(f'Model is quantized and saved at "{quant_path}"')
+    ```
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
 AWQ models are also supported directly through the LLM entrypoint:
-```python
+??? Code
-from vllm import LLM, SamplingParams
+    ```python
-# Sample prompts.
+    from vllm import LLM, SamplingParams
-prompts = [
-    "Hello, my name is",
+    # Sample prompts.
-    "The president of the United States is",
+    prompts = [
-    "The capital of France is",
+        "Hello, my name is",
-    "The future of AI is",
+        "The president of the United States is",
-]
+        "The capital of France is",
-# Create a sampling params object.
+        "The future of AI is",
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    ]
+    # Create a sampling params object.
-# Create an LLM.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+    # Create an LLM.
-# that contain the prompt, generated text, and other information.
+    llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
-outputs = llm.generate(prompts, sampling_params)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
-# Print the outputs.
+    # that contain the prompt, generated text, and other information.
-for output in outputs:
+    outputs = llm.generate(prompts, sampling_params)
-    prompt = output.prompt
+    # Print the outputs.
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        prompt = output.prompt
-```
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    ```
--- a/docs/features/quantization/bitblas.md
+++ b/docs/features/quantization/bitblas.md
@@ -43,17 +43,19 @@ llm = LLM(
 ## Read gptq format checkpoint
-```python
+??? Code
-from vllm import LLM
-import torch
+    ```python
+    from vllm import LLM
-# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
+    import torch
-model_id = "hxbgsyxh/llama-13b-4bit-g-1"
-llm = LLM(
+    # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
-    model=model_id,
+    model_id = "hxbgsyxh/llama-13b-4bit-g-1"
-    dtype=torch.float16,
+    llm = LLM(
-    trust_remote_code=True,
+        model=model_id,
-    quantization="bitblas",
+        dtype=torch.float16,
-    max_model_len=1024
+        trust_remote_code=True,
-)
+        quantization="bitblas",
-```
+        max_model_len=1024
+    )
+    ```
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
 Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
-```python
+??? Code
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import QuantizationModifier
-# Configure the simple PTQ quantization
+    ```python
-recipe = QuantizationModifier(
+    from llmcompressor.transformers import oneshot
-  targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    from llmcompressor.modifiers.quantization import QuantizationModifier
-# Apply the quantization algorithm.
+    # Configure the simple PTQ quantization
-oneshot(model=model, recipe=recipe)
+    recipe = QuantizationModifier(
+      targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
-# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
+    # Apply the quantization algorithm.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+    oneshot(model=model, recipe=recipe)
-model.save_pretrained(SAVE_DIR)
-tokenizer.save_pretrained(SAVE_DIR)
+    # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
-```
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+    model.save_pretrained(SAVE_DIR)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```
 ### 3. Evaluating Accuracy

--- a/docs/features/quantization/gguf.md
+++ b/docs/features/quantization/gguf.md
@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
 You can also use the GGUF model directly through the LLM entrypoint:
-```python
+??? Code
-from vllm import LLM, SamplingParams
+      ```python
-# In this script, we demonstrate how to pass input to the chat method:
+      from vllm import LLM, SamplingParams
-conversation = [
-   {
+      # In this script, we demonstrate how to pass input to the chat method:
-      "role": "system",
+      conversation = [
-      "content": "You are a helpful assistant"
+         {
-   },
+            "role": "system",
-   {
+            "content": "You are a helpful assistant"
-      "role": "user",
+         },
-      "content": "Hello"
+         {
-   },
+            "role": "user",
-   {
+            "content": "Hello"
-      "role": "assistant",
+         },
-      "content": "Hello! How can I assist you today?"
+         {
-   },
+            "role": "assistant",
-   {
+            "content": "Hello! How can I assist you today?"
-      "role": "user",
+         },
-      "content": "Write an essay about the importance of higher education.",
+         {
-   },
+            "role": "user",
-]
+            "content": "Write an essay about the importance of higher education.",
+         },
-# Create a sampling params object.
+      ]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+      # Create a sampling params object.
-# Create an LLM.
+      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-         tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+      # Create an LLM.
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+      llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-# that contain the prompt, generated text, and other information.
+               tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-outputs = llm.chat(conversation, sampling_params)
+      # Generate texts from the prompts. The output is a list of RequestOutput objects
+      # that contain the prompt, generated text, and other information.
-# Print the outputs.
+      outputs = llm.chat(conversation, sampling_params)
-for output in outputs:
-   prompt = output.prompt
+      # Print the outputs.
-   generated_text = output.outputs[0].text
+      for output in outputs:
-   print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+         prompt = output.prompt
-```
+         generated_text = output.outputs[0].text
+         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+      ```
--- a/docs/features/quantization/gptqmodel.md
+++ b/docs/features/quantization/gptqmodel.md
@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
 Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
-```python
+??? Code
-from datasets import load_dataset
-from gptqmodel import GPTQModel, QuantizeConfig
-model_id = "meta-llama/Llama-3.2-1B-Instruct"
+    ```python
-quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
+    from datasets import load_dataset
+    from gptqmodel import GPTQModel, QuantizeConfig
-calibration_dataset = load_dataset(
+    model_id = "meta-llama/Llama-3.2-1B-Instruct"
-    "allenai/c4",
+    quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
-    data_files="en/c4-train.00001-of-01024.json.gz",
-    split="train"
-  ).select(range(1024))["text"]
-quant_config = QuantizeConfig(bits=4, group_size=128)
+    calibration_dataset = load_dataset(
+        "allenai/c4",
+        data_files="en/c4-train.00001-of-01024.json.gz",
+        split="train"
+    ).select(range(1024))["text"]
-model = GPTQModel.load(model_id, quant_config)
+    quant_config = QuantizeConfig(bits=4, group_size=128)
-# increase `batch_size` to match gpu/vram specs to speed up quantization
+    model = GPTQModel.load(model_id, quant_config)
-model.quantize(calibration_dataset, batch_size=2)
-model.save(quant_path)
+    # increase `batch_size` to match gpu/vram specs to speed up quantization
-```
+    model.quantize(calibration_dataset, batch_size=2)
+    model.save(quant_path)
+    ```
 ## Running a quantized model with vLLM
@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
 GPTQModel quantized models are also supported directly through the LLM entrypoint:
-```python
+??? Code
-from vllm import LLM, SamplingParams
+    ```python
-# Sample prompts.
+    from vllm import LLM, SamplingParams
-prompts = [
-    "Hello, my name is",
+    # Sample prompts.
-    "The president of the United States is",
+    prompts = [
-    "The capital of France is",
+        "Hello, my name is",
-    "The future of AI is",
+        "The president of the United States is",
-]
+        "The capital of France is",
+        "The future of AI is",
-# Create a sampling params object.
+    ]
-sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
+    # Create a sampling params object.
-# Create an LLM.
+    sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
-llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
+    # Create an LLM.
-# Generate texts from the prompts. The output is a list of RequestOutput objects
+    llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
-# Print the outputs.
+    outputs = llm.generate(prompts, sampling_params)
-print("-"*50)
-for output in outputs:
+    # Print the outputs.
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-"*50)
-```
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-"*50)
+    ```
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
 It's best to use calibration data that closely matches your deployment data.
 For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
-```python
+??? Code
-from datasets import load_dataset
-NUM_CALIBRATION_SAMPLES = 512
+    ```python
-MAX_SEQUENCE_LENGTH = 2048
+    from datasets import load_dataset
-# Load and preprocess the dataset
+    NUM_CALIBRATION_SAMPLES = 512
-ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+    MAX_SEQUENCE_LENGTH = 2048
-ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-def preprocess(example):
+    # Load and preprocess the dataset
-    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
-ds = ds.map(preprocess)
+    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-def tokenize(sample):
+    def preprocess(example):
-    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ds = ds.map(preprocess)
-```
+    def tokenize(sample):
+        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ```
 ### 3. Applying Quantization
 Now, apply the quantization algorithms:
-```python
+??? Code
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-# Configure the quantization algorithms
-recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
-# Apply quantization
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-)
-# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
+    ```python
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+    from llmcompressor.transformers import oneshot
-model.save_pretrained(SAVE_DIR, save_compressed=True)
+    from llmcompressor.modifiers.quantization import GPTQModifier
-tokenizer.save_pretrained(SAVE_DIR)
+    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-```
+    # Configure the quantization algorithms
+    recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+    # Apply quantization
+    oneshot(
+        model=model,
+        dataset=ds,
+        recipe=recipe,
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    )
+    # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```
 This process creates a W4A16 model with weights quantized to 4-bit integers.
@@ -137,34 +141,36 @@ $ lm_eval --model vllm \
 The following is an example of an expanded quantization recipe you can tune to your own use case:
-```python
+??? Code
-from compressed_tensors.quantization import (
-    QuantizationArgs,
+    ```python
-    QuantizationScheme,
+    from compressed_tensors.quantization import (
-    QuantizationStrategy,
+        QuantizationArgs,
-    QuantizationType,
+        QuantizationScheme,
-) 
+        QuantizationStrategy,
-recipe = GPTQModifier(
+        QuantizationType,
-    targets="Linear",
+    ) 
-    config_groups={
+    recipe = GPTQModifier(
-        "config_group": QuantizationScheme(
+        targets="Linear",
-            targets=["Linear"],
+        config_groups={
-            weights=QuantizationArgs(
+            "config_group": QuantizationScheme(
-                num_bits=4,
+                targets=["Linear"],
-                type=QuantizationType.INT,
+                weights=QuantizationArgs(
-                strategy=QuantizationStrategy.GROUP,
+                    num_bits=4,
-                group_size=128,
+                    type=QuantizationType.INT,
-                symmetric=True,
+                    strategy=QuantizationStrategy.GROUP,
-                dynamic=False,
+                    group_size=128,
-                actorder="weight",
+                    symmetric=True,
+                    dynamic=False,
+                    actorder="weight",
+                ),
            ),
-        ),
+        },
-    },
+        ignore=["lm_head"],
-    ignore=["lm_head"],
+        update_size=NUM_CALIBRATION_SAMPLES,
-    update_size=NUM_CALIBRATION_SAMPLES,
+        dampening_frac=0.01
-    dampening_frac=0.01
+    )
-)
+    ```
-```
 ## Troubleshooting and Support

--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
 It's best to use calibration data that closely matches your deployment data.
 For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
-```python
+??? Code
-from datasets import load_dataset
-NUM_CALIBRATION_SAMPLES = 512
+    ```python
-MAX_SEQUENCE_LENGTH = 2048
+    from datasets import load_dataset
-# Load and preprocess the dataset
+    NUM_CALIBRATION_SAMPLES = 512
-ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
+    MAX_SEQUENCE_LENGTH = 2048
-ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-def preprocess(example):
+    # Load and preprocess the dataset
-    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
+    ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
-ds = ds.map(preprocess)
+    ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
-def tokenize(sample):
+    def preprocess(example):
-    return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+        return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ds = ds.map(preprocess)
-```
+    def tokenize(sample):
+        return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+    ```
 ### 3. Applying Quantization
 Now, apply the quantization algorithms:
-```python
+??? Code
-from llmcompressor.transformers import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+    ```python
-from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+    from llmcompressor.transformers import oneshot
+    from llmcompressor.modifiers.quantization import GPTQModifier
-# Configure the quantization algorithms
+    from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
-recipe = [
-    SmoothQuantModifier(smoothing_strength=0.8),
+    # Configure the quantization algorithms
-    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+    recipe = [
-]
+        SmoothQuantModifier(smoothing_strength=0.8),
+        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
-# Apply quantization
+    ]
-oneshot(
-    model=model,
+    # Apply quantization
-    dataset=ds,
+    oneshot(
-    recipe=recipe,
+        model=model,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
+        dataset=ds,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+        recipe=recipe,
-)
+        max_seq_length=MAX_SEQUENCE_LENGTH,
+        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
+    )
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
+    # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
-tokenizer.save_pretrained(SAVE_DIR)
+    SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
-```
+    model.save_pretrained(SAVE_DIR, save_compressed=True)
+    tokenizer.save_pretrained(SAVE_DIR)
+    ```
 This process creates a W8A8 model with weights and activations quantized to 8-bit integers.

--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
 Below is an example showing how to quantize a model using modelopt's PTQ API:
-```python
+??? Code
-import modelopt.torch.quantization as mtq
-from transformers import AutoModelForCausalLM
-# Load the model from HuggingFace
+    ```python
-model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
+    import modelopt.torch.quantization as mtq
+    from transformers import AutoModelForCausalLM
-# Select the quantization config, for example, FP8
+    # Load the model from HuggingFace
-config = mtq.FP8_DEFAULT_CFG
+    model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
-# Define a forward loop function for calibration
+    # Select the quantization config, for example, FP8
-def forward_loop(model):
+    config = mtq.FP8_DEFAULT_CFG
-    for data in calib_set:
-        model(data)
-# PTQ with in-place replacement of quantized modules
+    # Define a forward loop function for calibration
-model = mtq.quantize(model, config, forward_loop)
+    def forward_loop(model):
-```
+        for data in calib_set:
+            model(data)
+    # PTQ with in-place replacement of quantized modules
+    model = mtq.quantize(model, config, forward_loop)
+    ```
 After the model is quantized, you can export it to a quantized checkpoint using the export API:
@@ -48,31 +50,33 @@ with torch.inference_mode():
 The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
-```python
+??? Code
-from vllm import LLM, SamplingParams
-def main():
+    ```python
+    from vllm import LLM, SamplingParams
-    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
+    def main():
-    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
-    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
+        model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
+        # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+        llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
-    prompts = [
+        sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    outputs = llm.generate(prompts, sampling_params)
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
-    for output in outputs:
+        outputs = llm.generate(prompts, sampling_params)
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-if __name__ == "__main__":
+        for output in outputs:
-    main()
+            prompt = output.prompt
-```
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    if __name__ == "__main__":
+        main()
+    ```
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
--- a/docs/features/quantization/torchao.md
+++ b/docs/features/quantization/torchao.md
@@ -15,26 +15,28 @@ pip install \
 ## Quantizing HuggingFace Models
 You can quantize your own Hugging Face model with torchao, e.g. via [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) or [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to the Hugging Face Hub, like [this one](https://huggingface.co/jerryzh168/llama3-8b-int8wo), with the following example code:
-```Python
+??? Code
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+    ```Python
-from torchao.quantization import Int8WeightOnlyConfig
+    import torch
+    from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-model_name = "meta-llama/Meta-Llama-3-8B"
+    from torchao.quantization import Int8WeightOnlyConfig
-quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
-quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_name = "meta-llama/Meta-Llama-3-8B"
-    model_name,
+    quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
-    torch_dtype="auto",
+    quantized_model = AutoModelForCausalLM.from_pretrained(
-    device_map="auto",
+        model_name,
-    quantization_config=quantization_config
+        torch_dtype="auto",
-)
+        device_map="auto",
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+        quantization_config=quantization_config
-input_text = "What are we having for dinner?"
+    )
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    input_text = "What are we having for dinner?"
-hub_repo = # YOUR HUB REPO ID
+    input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-tokenizer.push_to_hub(hub_repo)
-quantized_model.push_to_hub(hub_repo, safe_serialization=False)
+    hub_repo = # YOUR HUB REPO ID
-```
+    tokenizer.push_to_hub(hub_repo)
+    quantized_model.push_to_hub(hub_repo, safe_serialization=False)
+    ```
 Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md