Merge tag 'v0.8.5' into v0.8.5-ori

081057de · zhuwenwen · 7cf5d5c4 · ba41cc90 · 081057de · 081057de
Commit 081057de authored Apr 29, 2025 by zhuwenwen
20 changed files
--- a/docs/source/assets/deployment/open_webui.png
+++ b/docs/source/assets/deployment/open_webui.png
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -177,6 +177,11 @@ def linkcode_resolve(domain, info):
        for part in info['fullname'].split('.'):
            obj = getattr(obj, part)
+            # Skip decorator wrappers by checking if the object is a function
+            # and has a __wrapped__ attribute (which decorators typically set)
+            while hasattr(obj, '__wrapped__'):
+                obj = obj.__wrapped__
            if not (inspect.isclass(obj) or inspect.isfunction(obj)
                    or inspect.ismethod(obj)):
                obj = obj.__class__  # Get the class of the instance

--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -128,11 +128,9 @@ HF processing as well as memory profiling.
 ### For memory profiling
-Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
+Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
-to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
-the model so that vLLM can reserve the correct amount of memory for it.
-Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
+Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
 ::::{tab-set}
 :::{tab-item} Basic example: LLaVA
@@ -244,38 +242,45 @@ def get_num_image_tokens(
 ```
 Notice that the number of image tokens doesn't depend on the image width and height.
-We can simply use a dummy `image_size`:
+We can simply use a dummy `image_size` to calculate the multimodal profiling data:
 ```python
+# NOTE: In actuality, this is usually implemented as part of the
+# model's subclass of `BaseProcessingInfo`, but we show it as is
+# here for simplicity.
 def get_image_size_with_most_features(self) -> ImageSize:
    hf_config = self.get_hf_config()
    width = height = hf_config.image_size
    return ImageSize(width=width, height=height)
-def get_dummy_processor_inputs(
+def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
-) -> ProcessorInputs:
+) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)
-    processor = self.info.get_hf_processor()
+    target_width, target_height = \
-    image_token = processor.image_token
+        self.info.get_image_size_with_most_features()
-    hf_config = self.get_hf_config()
-    target_width, target_height = self.info.get_image_size_with_most_features()
-    mm_data = {
+    return {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }
+```
-    return ProcessorInputs(
+For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
-        prompt_text=image_token * num_images,
-        mm_data=mm_data,
+```python
-    )
+def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+    num_images = mm_counts.get("image", 0)
+    processor = self.info.get_hf_processor()
+    image_token = processor.image_token
+    return image_token * num_images
 ```
 :::
@@ -412,29 +417,30 @@ def get_image_size_with_most_features(self) -> ImageSize:
 Fuyu does not expect image placeholders in the inputs to HF processor, so
 the dummy prompt text is empty regardless of the number of images.
-Otherwise, the logic of this method is very similar to LLaVA:
 ```python
-def get_dummy_processor_inputs(
+def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+    return ""
+```
+For the multimodal image profiling data, the logic is very similar to LLaVA:
+```python
+def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
-) -> ProcessorInputs:
+) -> MultiModalDataDict:
    target_width, target_height = \
        self.info.get_image_size_with_most_features()
    num_images = mm_counts.get("image", 0)
-    mm_data = {
+    return {
        "image":
        self._get_dummy_images(width=target_width,
-                                height=target_height,
+                               height=target_height,
-                                num_images=num_images)
+                               num_images=num_images)
    }
-    return ProcessorInputs(
-        prompt_text="",
-        mm_data=mm_data,
-    )
 ```
 :::

--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@@ -19,6 +19,18 @@ $ docker run --runtime nvidia --gpus all \
    --model mistralai/Mistral-7B-v0.1
 ```
+This image can also be used with other container engines such as [Podman](https://podman.io/).
+```console
+$ podman run --gpus all \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+  -p 8000:8000 \
+  --ipc=host \
+  vllm/vllm-openai:latest \
+  --model mistralai/Mistral-7B-v0.1
+```
 You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`).
 :::{note}

--- a/docs/source/deployment/frameworks/anything-llm.md
+++ b/docs/source/deployment/frameworks/anything-llm.md
+(deployment-anything-llm)=
+# Anything LLM
+[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
+It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.
+## Prerequisites
+- Set up the vLLM environment
+## Deploy
+- Start the vLLM server with the supported chat completion model, e.g.
+```console
+vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
+```
+- Download and install [Anything LLM desktop](https://anythingllm.com/desktop).
+- On the bottom left of open settings, AI Providers --> LLM:
+  - LLM Provider: Generic OpenAI
+  - Base URL: http://{vllm server host}:{vllm server port}/v1
+  - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
+:::{image} /assets/deployment/anything-llm-provider.png
+:::
+- Back to home page, New Workspace --> create `vllm` workspace, and start to chat:
+:::{image} /assets/deployment/anything-llm-chat-without-doc.png
+:::
+- Click the upload button:
+  - upload the doc
+  - select the doc and move to the workspace
+  - save and embed
+:::{image} /assets/deployment/anything-llm-upload-doc.png
+:::
+- Chat again:
+:::{image} /assets/deployment/anything-llm-chat-with-doc.png
+:::
--- a/docs/source/deployment/frameworks/index.md
+++ b/docs/source/deployment/frameworks/index.md
@@ -3,12 +3,14 @@
 :::{toctree}
 :maxdepth: 1
+anything-llm
 bentoml
 cerebrium
 dstack
 helm
 lws
 modal
+open-webui
 skypilot
 triton
 :::
--- a/docs/source/deployment/frameworks/open-webui.md
+++ b/docs/source/deployment/frameworks/open-webui.md
+(deployment-open-webui)=
+# Open WebUI
+1. Install the [Docker](https://docs.docker.com/engine/install/)
+2. Start the vLLM server with the supported chat completion model, e.g.
+```console
+vllm serve qwen/Qwen1.5-0.5B-Chat
+```
+3. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
+```console
+docker run -d -p 3000:8080 \
+--name open-webui \
+-v open-webui:/app/backend/data \
+-e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
+--restart always \
+ghcr.io/open-webui/open-webui:main
+```
+4. Open it in the browser: <http://open-webui-host:3000/>
+On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
+:::{image} /assets/deployment/open_webui.png
+:::
--- a/docs/source/deployment/integrations/production-stack.md
+++ b/docs/source/deployment/integrations/production-stack.md
@@ -16,7 +16,7 @@ Ensure that you have a running Kubernetes environment with GPU (you can follow [
 ## Deployment using vLLM production stack
-The standard vLLM production stack install uses a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/tutorials/install-helm.sh) to install Helm on your GPU server.
+The standard vLLM production stack is installed using a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/utils/install-helm.sh) to install Helm on your GPU server.
 To install the vLLM production stack, run the following commands on your desktop:

--- a/docs/source/deployment/security.md
+++ b/docs/source/deployment/security.md
+# Security Guide
+## Inter-Node Communication
+All communications between nodes in a multi-node vLLM deployment are **insecure by default** and must be protected by placing the nodes on an isolated network. This includes:
+1. PyTorch Distributed communications
+2. KV cache transfer communications
+3. Tensor, Pipeline, and Data parallel communications
+### Configuration Options for Inter-Node Communications
+The following options control inter-node communications in vLLM:
+1. **Environment Variables:**
+   - `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on
+2. **KV Cache Transfer Configuration:**
+   - `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1)
+   - `--kv-port`: The port for KV cache transfer communications (default: 14579)
+3. **Data Parallel Configuration:**
+   - `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1)
+   - `data_parallel_master_port`: Port of the data parallel master (default: 29500)
+### Notes on PyTorch Distributed
+vLLM uses PyTorch's distributed features for some inter-node communication. For
+detailed information about PyTorch Distributed security considerations, please
+refer to the [PyTorch Security
+Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).
+Key points from the PyTorch security guide:
+- PyTorch Distributed features are intended for internal communication only
+- They are not built for use in untrusted environments or networks
+- No authorization protocol is included for performance reasons
+- Messages are sent unencrypted
+- Connections are accepted from anywhere without checks
+### Security Recommendations
+1. **Network Isolation:**
+   - Deploy vLLM nodes on a dedicated, isolated network
+   - Use network segmentation to prevent unauthorized access
+   - Implement appropriate firewall rules
+2. **Configuration Best Practices:**
+   - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults
+   - Configure firewalls to only allow necessary ports between nodes
+3. **Access Control:**
+   - Restrict physical and network access to the deployment environment
+   - Implement proper authentication and authorization for management interfaces
+   - Follow the principle of least privilege for all system components
+## Reporting Security Vulnerabilities
+If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md).
--- a/docs/source/design/mm_processing.md
+++ b/docs/source/design/mm_processing.md
@@ -47,7 +47,7 @@ Moreover, since the tokenized text has not passed through the HF processor, we h
 ### Dummy text
-We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
+We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
 (mm-automatic-prompt-updating)=

--- a/docs/source/design/v1/metrics.md
+++ b/docs/source/design/v1/metrics.md
@@ -66,8 +66,8 @@ vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_
 The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
 - `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds
- `vllm:prompt_tokens_total` - Prompt Tokens/Sec
+- `vllm:prompt_tokens_total` - Prompt Tokens
- `vllm:generation_tokens_total` - Generation Tokens/Sec
+- `vllm:generation_tokens_total` - Generation Tokens
 - `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in seconds.
 - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
 - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state
@@ -86,6 +86,17 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b
 Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.
+With the switch to `prometheus_client`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
+```bash
+$ curl http://0.0.0.0:8000/metrics 2>/dev/null  | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
+http_requests_total{handler="/v1/completions",method="POST",status="2xx"} 201.0
+http_request_size_bytes_count{handler="/v1/completions"} 201.0
+http_response_size_bytes_count{handler="/v1/completions"} 201.0
+http_request_duration_highr_seconds_count 201.0
+http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0
+```
 ### Multi-process Mode
 In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See <gh-pr:7279>.

--- a/docs/source/design/v1/torch_compile.md
+++ b/docs/source/design/v1/torch_compile.md
@@ -99,7 +99,7 @@ This time, Inductor compilation is completely bypassed, and we will load from di
 The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:
-`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"`
+`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'compile_sizes': [1, 2, 4, 8]}"`
 Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
@@ -134,6 +134,6 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
 By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:
-`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"`
+`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"`
 Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
--- a/docs/source/features/disagg_prefill.md
+++ b/docs/source/features/disagg_prefill.md
@@ -21,11 +21,11 @@ Disaggregated prefill DOES NOT improve throughput.
 ## Usage example
-Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling.
+Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
 ## Benchmarks
-Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks.
+Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
 ## Development

--- a/docs/source/features/lora.md
+++ b/docs/source/features/lora.md
@@ -106,19 +106,18 @@ curl http://localhost:8000/v1/completions \
 ## Dynamically serving LoRA Adapters
-In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading
+In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed.
-LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
-to change models on-the-fly is needed.
 Note: Enabling this feature in production environments is risky as users may participate in model adapter management.
-To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
+To enable dynamic LoRA configuration, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
-is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active.
+is set to `True`.
 ```bash
 export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
 ```
+### Using API Endpoints
 Loading a LoRA Adapter:
 To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
@@ -153,6 +152,58 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \
 }'
 ```
+### Using Plugins
+Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter.
+You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.
+You can either install existing plugins or implement your own.
+Steps to implement your own LoRAResolver plugin:
+1. Implement the LoRAResolver interface.
+    Example of a simple S3 LoRAResolver implementation:
+    ```python
+    import os
+    import s3fs
+    from vllm.lora.request import LoRARequest
+    from vllm.lora.resolver import LoRAResolver
+    class S3LoRAResolver(LoRAResolver):
+        def __init__(self):
+            self.s3 = s3fs.S3FileSystem()
+            self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
+            self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
+        async def resolve_lora(self, base_model_name, lora_name):
+            s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
+            local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
+            # Download the LoRA from S3 to the local path
+            await self.s3._get(
+                s3_path, local_path, recursive=True, maxdepth=1
+            )
+            lora_request = LoRARequest(
+                lora_name=lora_name,
+                lora_path=local_path,
+                lora_int_id=abs(hash(lora_name))
+            )
+            return lora_request
+    ```
+2. Register LoRAResolver plugin.
+    ```python
+    from vllm.lora.resolver import LoRAResolverRegistry
+    s3_resolver = S3LoRAResolver()
+    LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver)
+    ```
+    For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md).
 ## New format for `--lora-modules`
 In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:

--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -6,13 +6,13 @@ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github
 Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
 The main benefits are lower latency and memory usage.
-You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq).
+You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq).
 ```console
 pip install autoawq
 ```
-After installing AutoAWQ, you are ready to quantize a model. Please refer to the `AutoAWQ documentation <https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization>`_ for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
+After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
 ```python
 from awq import AutoAWQForCausalLM

--- a/docs/source/features/quantization/bitblas.md
+++ b/docs/source/features/quantization/bitblas.md
+(bitblas)=
+# BitBLAS
+vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations.
+:::{note}
+Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
+Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
+For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html).
+:::
+Below are the steps to utilize BitBLAS with vLLM.
+```console
+pip install "bitblas>=0.1.0"
+```
+vLLM reads the model's config file and supports pre-quantized checkpoints.
+You can find pre-quantized models on:
+- [Hugging Face (BitBLAS)](https://huggingface.co/models?search=bitblas)
+- [Hugging Face (GPTQ)](https://huggingface.co/models?search=gptq)
+Usually, these repositories have a `quantize_config.json` file that includes a `quantization_config` section.
+## Read bitblas format checkpoint
+```python
+from vllm import LLM
+import torch
+# "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint.
+model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas"
+llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas")
+```
+## Read gptq format checkpoint
+```python
+from vllm import LLM
+import torch
+# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
+model_id = "hxbgsyxh/llama-13b-4bit-g-1"
+llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024)
+```
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@@ -14,7 +14,7 @@ pip install bitsandbytes>=0.45.3
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
-You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>.
+You can find bitsandbytes quantized models on <https://huggingface.co/models?search=bitsandbytes>.
 And usually, these repositories have a config.json file that includes a quantization_config section.
 ## Read quantized checkpoint

--- a/docs/source/features/quantization/gptqmodel.md
+++ b/docs/source/features/quantization/gptqmodel.md
@@ -16,12 +16,16 @@ GPTQModel is one of the few quantization toolkits in the world that allows `Dyna
 is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override)
 for more details on this and other advanced features.
-You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq).
+## Installation
+You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq).
 ```console
 pip install -U gptqmodel --no-build-isolation -v
 ```
+## Quantizing a model
 After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details.
 Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
@@ -49,12 +53,16 @@ model.quantize(calibration_dataset, batch_size=2)
 model.save(quant_path)
 ```
+## Running a quantized model with vLLM
 To run a GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
 ```console
-python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
+python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
 ```
+## Using GPTQModel with vLLM's Python API
 GPTQModel quantized models are also supported directly through the LLM entrypoint:
 ```python
@@ -67,17 +75,22 @@ prompts = [
    "The capital of France is",
    "The future of AI is",
 ]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
 # Create an LLM.
-llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
+llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
 # Print the outputs.
+print("-"*50)
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+    print("-"*50)
 ```
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@@ -11,6 +11,7 @@ Quantization trades off model precision for smaller memory footprint, allowing l
 supported_hardware
 auto_awq
 bnb
+bitblas
 gguf
 gptqmodel
 int4

--- a/docs/source/features/quantization/supported_hardware.md
+++ b/docs/source/features/quantization/supported_hardware.md
@@ -74,6 +74,17 @@ The table below shows the compatibility of various quantization implementations
  * ❌
  * ❌
  * ❌
+- * BitBLAS (GPTQ)
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ✅︎
+  * ❌
+  * ❌
+  * ❌
+  * ❌
 - * AQLM
  * ✅︎
  * ✅︎