Commit 081057de authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.5' into v0.8.5-ori

parents 7cf5d5c4 ba41cc90
...@@ -177,6 +177,11 @@ def linkcode_resolve(domain, info): ...@@ -177,6 +177,11 @@ def linkcode_resolve(domain, info):
for part in info['fullname'].split('.'): for part in info['fullname'].split('.'):
obj = getattr(obj, part) obj = getattr(obj, part)
# Skip decorator wrappers by checking if the object is a function
# and has a __wrapped__ attribute (which decorators typically set)
while hasattr(obj, '__wrapped__'):
obj = obj.__wrapped__
if not (inspect.isclass(obj) or inspect.isfunction(obj) if not (inspect.isclass(obj) or inspect.isfunction(obj)
or inspect.ismethod(obj)): or inspect.ismethod(obj)):
obj = obj.__class__ # Get the class of the instance obj = obj.__class__ # Get the class of the instance
......
...@@ -128,11 +128,9 @@ HF processing as well as memory profiling. ...@@ -128,11 +128,9 @@ HF processing as well as memory profiling.
### For memory profiling ### For memory profiling
Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs` Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
the model so that vLLM can reserve the correct amount of memory for it.
Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens. Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
::::{tab-set} ::::{tab-set}
:::{tab-item} Basic example: LLaVA :::{tab-item} Basic example: LLaVA
...@@ -244,38 +242,45 @@ def get_num_image_tokens( ...@@ -244,38 +242,45 @@ def get_num_image_tokens(
``` ```
Notice that the number of image tokens doesn't depend on the image width and height. Notice that the number of image tokens doesn't depend on the image width and height.
We can simply use a dummy `image_size`: We can simply use a dummy `image_size` to calculate the multimodal profiling data:
```python ```python
# NOTE: In actuality, this is usually implemented as part of the
# model's subclass of `BaseProcessingInfo`, but we show it as is
# here for simplicity.
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
hf_config = self.get_hf_config() hf_config = self.get_hf_config()
width = height = hf_config.image_size width = height = hf_config.image_size
return ImageSize(width=width, height=height) return ImageSize(width=width, height=height)
def get_dummy_processor_inputs( def get_dummy_mm_data(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
) -> ProcessorInputs: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor() target_width, target_height = \
image_token = processor.image_token self.info.get_image_size_with_most_features()
hf_config = self.get_hf_config()
target_width, target_height = self.info.get_image_size_with_most_features()
mm_data = { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images)
} }
```
return ProcessorInputs( For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
prompt_text=image_token * num_images,
mm_data=mm_data, ```python
) def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor()
image_token = processor.image_token
return image_token * num_images
``` ```
::: :::
...@@ -412,29 +417,30 @@ def get_image_size_with_most_features(self) -> ImageSize: ...@@ -412,29 +417,30 @@ def get_image_size_with_most_features(self) -> ImageSize:
Fuyu does not expect image placeholders in the inputs to HF processor, so Fuyu does not expect image placeholders in the inputs to HF processor, so
the dummy prompt text is empty regardless of the number of images. the dummy prompt text is empty regardless of the number of images.
Otherwise, the logic of this method is very similar to LLaVA:
```python ```python
def get_dummy_processor_inputs( def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
return ""
```
For the multimodal image profiling data, the logic is very similar to LLaVA:
```python
def get_dummy_mm_data(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
) -> ProcessorInputs: ) -> MultiModalDataDict:
target_width, target_height = \ target_width, target_height = \
self.info.get_image_size_with_most_features() self.info.get_image_size_with_most_features()
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
mm_data = { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(width=target_width,
height=target_height, height=target_height,
num_images=num_images) num_images=num_images)
} }
return ProcessorInputs(
prompt_text="",
mm_data=mm_data,
)
``` ```
::: :::
......
...@@ -19,6 +19,18 @@ $ docker run --runtime nvidia --gpus all \ ...@@ -19,6 +19,18 @@ $ docker run --runtime nvidia --gpus all \
--model mistralai/Mistral-7B-v0.1 --model mistralai/Mistral-7B-v0.1
``` ```
This image can also be used with other container engines such as [Podman](https://podman.io/).
```console
$ podman run --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model mistralai/Mistral-7B-v0.1
```
You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`). You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`).
:::{note} :::{note}
......
(deployment-anything-llm)=
# Anything LLM
[Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.
## Prerequisites
- Setup vLLM environment
## Deploy
- Start the vLLM server with the supported chat completion model, e.g.
```console
vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
```
- Download and install [Anything LLM desktop](https://anythingllm.com/desktop).
- On the bottom left of open settings, AI Prooviders --> LLM:
- LLM Provider: Generic OpenAI
- Base URL: http://{vllm server host}:{vllm server port}/v1
- Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
:::{image} /assets/deployment/anything-llm-provider.png
:::
- Back to home page, New Workspace --> create `vllm` workspace, and start to chat:
:::{image} /assets/deployment/anything-llm-chat-without-doc.png
:::
- Click the upload button:
- upload the doc
- select the doc and move to the workspace
- save and embed
:::{image} /assets/deployment/anything-llm-upload-doc.png
:::
- Chat again:
:::{image} /assets/deployment/anything-llm-chat-with-doc.png
:::
...@@ -3,12 +3,14 @@ ...@@ -3,12 +3,14 @@
:::{toctree} :::{toctree}
:maxdepth: 1 :maxdepth: 1
anything-llm
bentoml bentoml
cerebrium cerebrium
dstack dstack
helm helm
lws lws
modal modal
open-webui
skypilot skypilot
triton triton
::: :::
(deployment-open-webui)=
# Open WebUI
1. Install the [Docker](https://docs.docker.com/engine/install/)
2. Start the vLLM server with the supported chat completion model, e.g.
```console
vllm serve qwen/Qwen1.5-0.5B-Chat
```
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
```console
docker run -d -p 3000:8080 \
--name open-webui \
-v open-webui:/app/backend/data \
-e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
--restart always \
ghcr.io/open-webui/open-webui:main
```
1. Open it in the browser: <http://open-webui-host:3000/>
On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
:::{image} /assets/deployment/open_webui.png
:::
...@@ -16,7 +16,7 @@ Ensure that you have a running Kubernetes environment with GPU (you can follow [ ...@@ -16,7 +16,7 @@ Ensure that you have a running Kubernetes environment with GPU (you can follow [
## Deployment using vLLM production stack ## Deployment using vLLM production stack
The standard vLLM production stack install uses a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/tutorials/install-helm.sh) to install Helm on your GPU server. The standard vLLM production stack is installed using a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/utils/install-helm.sh) to install Helm on your GPU server.
To install the vLLM production stack, run the following commands on your desktop: To install the vLLM production stack, run the following commands on your desktop:
......
# Security Guide
## Inter-Node Communication
All communications between nodes in a multi-node vLLM deployment are **insecure by default** and must be protected by placing the nodes on an isolated network. This includes:
1. PyTorch Distributed communications
2. KV cache transfer communications
3. Tensor, Pipeline, and Data parallel communications
### Configuration Options for Inter-Node Communications
The following options control inter-node communications in vLLM:
1. **Environment Variables:**
- `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on
2. **KV Cache Transfer Configuration:**
- `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1)
- `--kv-port`: The port for KV cache transfer communications (default: 14579)
3. **Data Parallel Configuration:**
- `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1)
- `data_parallel_master_port`: Port of the data parallel master (default: 29500)
### Notes on PyTorch Distributed
vLLM uses PyTorch's distributed features for some inter-node communication. For
detailed information about PyTorch Distributed security considerations, please
refer to the [PyTorch Security
Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).
Key points from the PyTorch security guide:
- PyTorch Distributed features are intended for internal communication only
- They are not built for use in untrusted environments or networks
- No authorization protocol is included for performance reasons
- Messages are sent unencrypted
- Connections are accepted from anywhere without checks
### Security Recommendations
1. **Network Isolation:**
- Deploy vLLM nodes on a dedicated, isolated network
- Use network segmentation to prevent unauthorized access
- Implement appropriate firewall rules
2. **Configuration Best Practices:**
- Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults
- Configure firewalls to only allow necessary ports between nodes
3. **Access Control:**
- Restrict physical and network access to the deployment environment
- Implement proper authentication and authorization for management interfaces
- Follow the principle of least privilege for all system components
## Reporting Security Vulnerabilities
If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md).
...@@ -47,7 +47,7 @@ Moreover, since the tokenized text has not passed through the HF processor, we h ...@@ -47,7 +47,7 @@ Moreover, since the tokenized text has not passed through the HF processor, we h
### Dummy text ### Dummy text
We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data. We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
(mm-automatic-prompt-updating)= (mm-automatic-prompt-updating)=
......
...@@ -66,8 +66,8 @@ vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_ ...@@ -66,8 +66,8 @@ vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds - `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds
- `vllm:prompt_tokens_total` - Prompt Tokens/Sec - `vllm:prompt_tokens_total` - Prompt Tokens
- `vllm:generation_tokens_total` - Generation Tokens/Sec - `vllm:generation_tokens_total` - Generation Tokens
- `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in second. - `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in second.
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds. - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state
...@@ -86,6 +86,17 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b ...@@ -86,6 +86,17 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b
Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs. Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.
With the switch to `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
```bash
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
http_requests_total{handler="/v1/completions",method="POST",status="2xx"} 201.0
http_request_size_bytes_count{handler="/v1/completions"} 201.0
http_response_size_bytes_count{handler="/v1/completions"} 201.0
http_request_duration_highr_seconds_count 201.0
http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0
```
### Multi-process Mode ### Multi-process Mode
In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See <gh-pr:7279>. In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See <gh-pr:7279>.
......
...@@ -99,7 +99,7 @@ This time, Inductor compilation is completely bypassed, and we will load from di ...@@ -99,7 +99,7 @@ This time, Inductor compilation is completely bypassed, and we will load from di
The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example: The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:
`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"` `vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"`
Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
...@@ -134,6 +134,6 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh ...@@ -134,6 +134,6 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:
`VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"` `vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"`
Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
...@@ -21,11 +21,11 @@ Disaggregated prefill DOES NOT improve throughput. ...@@ -21,11 +21,11 @@ Disaggregated prefill DOES NOT improve throughput.
## Usage example ## Usage example
Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. Please refer to <gh-file:examples/online_serving/disaggregated_prefill.sh> for the example usage of disaggregated prefilling.
## Benchmarks ## Benchmarks
Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks. Please refer to <gh-file:benchmarks/disagg_benchmarks> for disaggregated prefilling benchmarks.
## Development ## Development
......
...@@ -106,19 +106,18 @@ curl http://localhost:8000/v1/completions \ ...@@ -106,19 +106,18 @@ curl http://localhost:8000/v1/completions \
## Dynamically serving LoRA Adapters ## Dynamically serving LoRA Adapters
In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed.
LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility
to change models on-the-fly is needed.
Note: Enabling this feature in production environments is risky as users may participate in model adapter management. Note: Enabling this feature in production environments is risky as users may participate in model adapter management.
To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` To enable dynamic LoRA configuration, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. is set to `True`.
```bash ```bash
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
``` ```
### Using API Endpoints
Loading a LoRA Adapter: Loading a LoRA Adapter:
To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
...@@ -153,6 +152,58 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \ ...@@ -153,6 +152,58 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \
}' }'
``` ```
### Using Plugins
Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter.
You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.
You can either install existing plugins or implement your own.
Steps to implement your own LoRAResolver plugin:
1. Implement the LoRAResolver interface.
Example of a simple S3 LoRAResolver implementation:
```python
import os
import s3fs
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver
class S3LoRAResolver(LoRAResolver):
def __init__(self):
self.s3 = s3fs.S3FileSystem()
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
async def resolve_lora(self, base_model_name, lora_name):
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
# Download the LoRA from S3 to the local path
await self.s3._get(
s3_path, local_path, recursive=True, maxdepth=1
)
lora_request = LoRARequest(
lora_name=lora_name,
lora_path=local_path,
lora_int_id=abs(hash(lora_name))
)
return lora_request
```
2. Register LoRAResolver plugin.
```python
from vllm.lora.resolver import LoRAResolverRegistry
s3_resolver = S3LoRAResolver()
LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver)
```
For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md).
## New format for `--lora-modules` ## New format for `--lora-modules`
In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
......
...@@ -6,13 +6,13 @@ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github ...@@ -6,13 +6,13 @@ To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github
Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint. Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
The main benefits are lower latency and memory usage. The main benefits are lower latency and memory usage.
You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). You can quantize your own models by installing AutoAWQ or picking one of the [6500+ models on Huggingface](https://huggingface.co/models?search=awq).
```console ```console
pip install autoawq pip install autoawq
``` ```
After installing AutoAWQ, you are ready to quantize a model. Please refer to the `AutoAWQ documentation <https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization>`_ for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
```python ```python
from awq import AutoAWQForCausalLM from awq import AutoAWQForCausalLM
......
(bitblas)=
# BitBLAS
vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations.
:::{note}
Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html).
:::
Below are the steps to utilize BitBLAS with vLLM.
```console
pip install bitblas>=0.1.0
```
vLLM reads the model's config file and supports pre-quantized checkpoints.
You can find pre-quantized models on:
- [Hugging Face (BitBLAS)](https://huggingface.co/models?search=bitblas)
- [Hugging Face (GPTQ)](https://huggingface.co/models?search=gptq)
Usually, these repositories have a `quantize_config.json` file that includes a `quantization_config` section.
## Read bitblas format checkpoint
```python
from vllm import LLM
import torch
# "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint.
model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas")
```
## Read gptq format checkpoint
```python
from vllm import LLM
import torch
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024)
```
...@@ -14,7 +14,7 @@ pip install bitsandbytes>=0.45.3 ...@@ -14,7 +14,7 @@ pip install bitsandbytes>=0.45.3
vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>. You can find bitsandbytes quantized models on <https://huggingface.co/models?search=bitsandbytes>.
And usually, these repositories have a config.json file that includes a quantization_config section. And usually, these repositories have a config.json file that includes a quantization_config section.
## Read quantized checkpoint ## Read quantized checkpoint
......
...@@ -16,12 +16,16 @@ GPTQModel is one of the few quantization toolkits in the world that allows `Dyna ...@@ -16,12 +16,16 @@ GPTQModel is one of the few quantization toolkits in the world that allows `Dyna
is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override) is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override)
for more details on this and other advanced features. for more details on this and other advanced features.
You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq). ## Installation
You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?search=gptq).
```console ```console
pip install -U gptqmodel --no-build-isolation -v pip install -U gptqmodel --no-build-isolation -v
``` ```
## Quantizing a model
After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details. After installing GPTQModel, you are ready to quantize a model. Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details.
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
...@@ -49,12 +53,16 @@ model.quantize(calibration_dataset, batch_size=2) ...@@ -49,12 +53,16 @@ model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path) model.save(quant_path)
``` ```
## Running a quantized model with vLLM
To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
```console ```console
python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
``` ```
## Using GPTQModel with vLLM's Python API
GPTQModel quantized models are also supported directly through the LLM entrypoint: GPTQModel quantized models are also supported directly through the LLM entrypoint:
```python ```python
...@@ -67,17 +75,22 @@ prompts = [ ...@@ -67,17 +75,22 @@ prompts = [
"The capital of France is", "The capital of France is",
"The future of AI is", "The future of AI is",
] ]
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9) sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
# Create an LLM. # Create an LLM.
llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
print("-"*50)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50)
``` ```
...@@ -11,6 +11,7 @@ Quantization trades off model precision for smaller memory footprint, allowing l ...@@ -11,6 +11,7 @@ Quantization trades off model precision for smaller memory footprint, allowing l
supported_hardware supported_hardware
auto_awq auto_awq
bnb bnb
bitblas
gguf gguf
gptqmodel gptqmodel
int4 int4
......
...@@ -74,6 +74,17 @@ The table below shows the compatibility of various quantization implementations ...@@ -74,6 +74,17 @@ The table below shows the compatibility of various quantization implementations
* *
* *
* *
- * BitBLAS (GPTQ)
* ✅︎
* ✅︎
* ✅︎
* ✅︎
* ✅︎
* ✅︎
*
*
*
*
- * AQLM - * AQLM
* ✅︎ * ✅︎
* ✅︎ * ✅︎
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment