Unverified commit f17aec0d, authored by Reid and committed by GitHub
Browse files

[doc] Fold long code blocks to improve readability (#19926)


Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
parent 493c2753
...@@ -448,9 +448,11 @@ elements of the entire head for all context tokens. However, overall, ...@@ -448,9 +448,11 @@ elements of the entire head for all context tokens. However, overall,
all results for output have been calculated but are just stored in all results for output have been calculated but are just stored in
different threads' register memory. different threads' register memory.
```cpp ??? Code
float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = NUM_WARPS; i > 1; i /= 2) { ```cpp
float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory. // Upper warps write to shared memory.
... ...
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
...@@ -467,8 +469,8 @@ for (int i = NUM_WARPS; i > 1; i /= 2) { ...@@ -467,8 +469,8 @@ for (int i = NUM_WARPS; i > 1; i /= 2) {
} }
// Write out the accs. // Write out the accs.
} }
``` ```
## Output ## Output
......
...@@ -13,11 +13,13 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture ( ...@@ -13,11 +13,13 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
```python ??? Code
# inside `setup.py` file
from setuptools import setup
setup(name='vllm_add_dummy_model', ```python
# inside `setup.py` file
from setuptools import setup
setup(name='vllm_add_dummy_model',
version='0.1', version='0.1',
packages=['vllm_add_dummy_model'], packages=['vllm_add_dummy_model'],
entry_points={ entry_points={
...@@ -25,8 +27,8 @@ setup(name='vllm_add_dummy_model', ...@@ -25,8 +27,8 @@ setup(name='vllm_add_dummy_model',
["register_dummy_model = vllm_add_dummy_model:register"] ["register_dummy_model = vllm_add_dummy_model:register"]
}) })
# inside `vllm_add_dummy_model.py` file # inside `vllm_add_dummy_model.py` file
def register(): def register():
from vllm import ModelRegistry from vllm import ModelRegistry
if "MyLlava" not in ModelRegistry.get_supported_archs(): if "MyLlava" not in ModelRegistry.get_supported_archs():
...@@ -34,7 +36,7 @@ def register(): ...@@ -34,7 +36,7 @@ def register():
"MyLlava", "MyLlava",
"vllm_add_dummy_model.my_llava:MyLlava", "vllm_add_dummy_model.my_llava:MyLlava",
) )
``` ```
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
......
...@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa ...@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
the third parameter is the path to the LoRA adapter. the third parameter is the path to the LoRA adapter.
```python ??? Code
sampling_params = SamplingParams(
```python
sampling_params = SamplingParams(
temperature=0, temperature=0,
max_tokens=256, max_tokens=256,
stop=["[/assistant]"] stop=["[/assistant]"]
) )
prompts = [ prompts = [
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
] ]
outputs = llm.generate( outputs = llm.generate(
prompts, prompts,
sampling_params, sampling_params,
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
) )
``` ```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
...@@ -68,9 +70,11 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora ...@@ -68,9 +70,11 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it): with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
```bash ??? Command
curl localhost:8000/v1/models | jq .
{ ```bash
curl localhost:8000/v1/models | jq .
{
"object": "list", "object": "list",
"data": [ "data": [
{ {
...@@ -84,8 +88,8 @@ curl localhost:8000/v1/models | jq . ...@@ -84,8 +88,8 @@ curl localhost:8000/v1/models | jq .
... ...
} }
] ]
} }
``` ```
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
...@@ -168,7 +172,7 @@ Alternatively, follow these example steps to implement your own plugin: ...@@ -168,7 +172,7 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface. 1. Implement the LoRAResolver interface.
Example of a simple S3 LoRAResolver implementation: ??? Example of a simple S3 LoRAResolver implementation
```python ```python
import os import os
...@@ -234,10 +238,12 @@ The new format of `--lora-modules` is mainly to support the display of parent mo ...@@ -234,10 +238,12 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the LoRA adapter. - The `root` field points to the artifact location of the LoRA adapter.
```bash ??? Command output
$ curl http://localhost:8000/v1/models
{ ```bash
$ curl http://localhost:8000/v1/models
{
"object": "list", "object": "list",
"data": [ "data": [
{ {
...@@ -267,5 +273,5 @@ $ curl http://localhost:8000/v1/models ...@@ -267,5 +273,5 @@ $ curl http://localhost:8000/v1/models
] ]
} }
] ]
} }
``` ```
...@@ -20,31 +20,33 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: ...@@ -20,31 +20,33 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
```python ??? Code
from vllm import LLM
llm = LLM(model="llava-hf/llava-1.5-7b-hf") ```python
from vllm import LLM
# Refer to the HuggingFace repo for the correct format to use llm = LLM(model="llava-hf/llava-1.5-7b-hf")
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Load the image using PIL.Image # Refer to the HuggingFace repo for the correct format to use
image = PIL.Image.open(...) prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Single prompt inference # Load the image using PIL.Image
outputs = llm.generate({ image = PIL.Image.open(...)
# Single prompt inference
outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": {"image": image}, "multi_modal_data": {"image": image},
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
# Batch inference # Batch inference
image_1 = PIL.Image.open(...) image_1 = PIL.Image.open(...)
image_2 = PIL.Image.open(...) image_2 = PIL.Image.open(...)
outputs = llm.generate( outputs = llm.generate(
[ [
{ {
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:", "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
...@@ -55,76 +57,80 @@ outputs = llm.generate( ...@@ -55,76 +57,80 @@ outputs = llm.generate(
"multi_modal_data": {"image": image_2}, "multi_modal_data": {"image": image_2},
} }
] ]
) )
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
Full example: <gh-file:examples/offline_inference/vision_language.py> Full example: <gh-file:examples/offline_inference/vision_language.py>
To substitute multiple images inside the same text prompt, you can pass in a list of images instead: To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
```python ??? Code
from vllm import LLM
llm = LLM( ```python
from vllm import LLM
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, # Required to load Phi-3.5-vision trust_remote_code=True, # Required to load Phi-3.5-vision
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept limit_mm_per_prompt={"image": 2}, # The maximum number to accept
) )
# Refer to the HuggingFace repo for the correct format to use # Refer to the HuggingFace repo for the correct format to use
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
# Load the images using PIL.Image # Load the images using PIL.Image
image1 = PIL.Image.open(...) image1 = PIL.Image.open(...)
image2 = PIL.Image.open(...) image2 = PIL.Image.open(...)
outputs = llm.generate({ outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": { "multi_modal_data": {
"image": [image1, image2] "image": [image1, image2]
}, },
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py> Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
```python ??? Code
from vllm import LLM
```python
from vllm import LLM
# Specify the maximum number of frames per video to be 4. This can be changed. # Specify the maximum number of frames per video to be 4. This can be changed.
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
# Create the request payload. # Create the request payload.
video_frames = ... # load your video making sure it only has the number of frames specified earlier. video_frames = ... # load your video making sure it only has the number of frames specified earlier.
message = { message = {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
], ],
} }
for i in range(len(video_frames)): for i in range(len(video_frames)):
base64_image = encode_image(video_frames[i]) # base64 encoding. base64_image = encode_image(video_frames[i]) # base64 encoding.
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
message["content"].append(new_image) message["content"].append(new_image)
# Perform inference and log output. # Perform inference and log output.
outputs = llm.chat([message]) outputs = llm.chat([message])
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
### Video Inputs ### Video Inputs
...@@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py> ...@@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
```python ??? Code
from vllm import LLM
# Inference with image embeddings as input ```python
llm = LLM(model="llava-hf/llava-1.5-7b-hf") from vllm import LLM
# Refer to the HuggingFace repo for the correct format to use # Inference with image embeddings as input
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:" llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Embeddings for single image # Refer to the HuggingFace repo for the correct format to use
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
image_embeds = torch.load(...)
outputs = llm.generate({ # Embeddings for single image
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
image_embeds = torch.load(...)
outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": {"image": image_embeds}, "multi_modal_data": {"image": image_embeds},
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
```python ??? Code
# Construct the prompt based on your model
prompt = ...
# Embeddings for multiple images ```python
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) # Construct the prompt based on your model
image_embeds = torch.load(...) prompt = ...
# Qwen2-VL # Embeddings for multiple images
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
mm_data = { image_embeds = torch.load(...)
# Qwen2-VL
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
mm_data = {
"image": { "image": {
"image_embeds": image_embeds, "image_embeds": image_embeds,
# image_grid_thw is needed to calculate positional encoding. # image_grid_thw is needed to calculate positional encoding.
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
} }
} }
# MiniCPM-V # MiniCPM-V
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
mm_data = { mm_data = {
"image": { "image": {
"image_embeds": image_embeds, "image_embeds": image_embeds,
# image_sizes is needed to calculate details of the sliced image. # image_sizes is needed to calculate details of the sliced image.
"image_sizes": [image.size for image in images], # list of image sizes "image_sizes": [image.size for image in images], # list of image sizes
} }
} }
outputs = llm.generate({ outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
## Online Serving ## Online Serving
...@@ -235,21 +245,23 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ ...@@ -235,21 +245,23 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
from openai import OpenAI
```python
from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( client = OpenAI(
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
# Single-image input inference # Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
chat_response = client.chat.completions.create( chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
messages=[{ messages=[{
"role": "user", "role": "user",
...@@ -260,14 +272,14 @@ chat_response = client.chat.completions.create( ...@@ -260,14 +272,14 @@ chat_response = client.chat.completions.create(
{"type": "image_url", "image_url": {"url": image_url}}, {"type": "image_url", "image_url": {"url": image_url}},
], ],
}], }],
) )
print("Chat completion output:", chat_response.choices[0].message.content) print("Chat completion output:", chat_response.choices[0].message.content)
# Multi-image input inference # Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_response = client.chat.completions.create( chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
messages=[{ messages=[{
"role": "user", "role": "user",
...@@ -277,9 +289,9 @@ chat_response = client.chat.completions.create( ...@@ -277,9 +289,9 @@ chat_response = client.chat.completions.create(
{"type": "image_url", "image_url": {"url": image_url_lion}}, {"type": "image_url", "image_url": {"url": image_url_lion}},
], ],
}], }],
) )
print("Chat completion output:", chat_response.choices[0].message.content) print("Chat completion output:", chat_response.choices[0].message.content)
``` ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -311,21 +323,23 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ...@@ -311,21 +323,23 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
from openai import OpenAI
```python
from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( client = OpenAI(
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
## Use video url in the payload ## Use video url in the payload
chat_completion_from_url = client.chat.completions.create( chat_completion_from_url = client.chat.completions.create(
messages=[{ messages=[{
"role": "role":
"user", "user",
...@@ -344,11 +358,11 @@ chat_completion_from_url = client.chat.completions.create( ...@@ -344,11 +358,11 @@ chat_completion_from_url = client.chat.completions.create(
}], }],
model=model, model=model,
max_completion_tokens=64, max_completion_tokens=64,
) )
result = chat_completion_from_url.choices[0].message.content result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:", result) print("Chat completion output from image url:", result)
``` ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -373,13 +387,15 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b ...@@ -373,13 +387,15 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def encode_base64_content_from_url(content_url: str) -> str: ```python
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format.""" """Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url) as response: with requests.get(content_url) as response:
...@@ -388,19 +404,19 @@ def encode_base64_content_from_url(content_url: str) -> str: ...@@ -388,19 +404,19 @@ def encode_base64_content_from_url(content_url: str) -> str:
return result return result
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( client = OpenAI(
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
# Any format supported by librosa is supported # Any format supported by librosa is supported
audio_url = AudioAsset("winning_call").url audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url) audio_base64 = encode_base64_content_from_url(audio_url)
chat_completion_from_base64 = client.chat.completions.create( chat_completion_from_base64 = client.chat.completions.create(
messages=[{ messages=[{
"role": "user", "role": "user",
"content": [ "content": [
...@@ -419,16 +435,18 @@ chat_completion_from_base64 = client.chat.completions.create( ...@@ -419,16 +435,18 @@ chat_completion_from_base64 = client.chat.completions.create(
}], }],
model=model, model=model,
max_completion_tokens=64, max_completion_tokens=64,
) )
result = chat_completion_from_base64.choices[0].message.content result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result) print("Chat completion output from input audio:", result)
``` ```
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
```python ??? Code
chat_completion_from_url = client.chat.completions.create(
```python
chat_completion_from_url = client.chat.completions.create(
messages=[{ messages=[{
"role": "user", "role": "user",
"content": [ "content": [
...@@ -446,11 +464,11 @@ chat_completion_from_url = client.chat.completions.create( ...@@ -446,11 +464,11 @@ chat_completion_from_url = client.chat.completions.create(
}], }],
model=model, model=model,
max_completion_tokens=64, max_completion_tokens=64,
) )
result = chat_completion_from_url.choices[0].message.content result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result) print("Chat completion output from audio url:", result)
``` ```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -470,47 +488,49 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary. ...@@ -470,47 +488,49 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field. For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server: The following example demonstrates how to pass image embeddings to the OpenAI server:
```python ??? Code
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct ```python
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
buffer = io.BytesIO() buffer = io.BytesIO()
torch.save(image_embedding, buffer) torch.save(image_embedding, buffer)
buffer.seek(0) buffer.seek(0)
binary_data = buffer.read() binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
client = OpenAI( client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY") # defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
# Basic usage - this is equivalent to the LLaVA example for offline inference # Basic usage - this is equivalent to the LLaVA example for offline inference
model = "llava-hf/llava-1.5-7b-hf" model = "llava-hf/llava-1.5-7b-hf"
embeds = { embeds = {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": f"{base64_image_embedding}" "image_embeds": f"{base64_image_embedding}"
} }
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V) # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
model = "Qwen/Qwen2-VL-2B-Instruct" model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = { embeds = {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": { "image_embeds": {
"image_embeds": f"{base64_image_embedding}" , # Required "image_embeds": f"{base64_image_embedding}" , # Required
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
}, },
} }
model = "openbmb/MiniCPM-V-2_6" model = "openbmb/MiniCPM-V-2_6"
embeds = { embeds = {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": { "image_embeds": {
"image_embeds": f"{base64_image_embedding}" , # Required "image_embeds": f"{base64_image_embedding}" , # Required
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
}, },
} }
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
messages=[ messages=[
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [ {"role": "user", "content": [
...@@ -521,10 +541,10 @@ chat_completion = client.chat.completions.create( ...@@ -521,10 +541,10 @@ chat_completion = client.chat.completions.create(
embeds, embeds,
], ],
}, },
], ],
model=model, model=model,
) )
``` ```
!!! note !!! note
Only one message can contain `{"type": "image_embeds"}`. Only one message can contain `{"type": "image_embeds"}`.
......
...@@ -15,29 +15,31 @@ pip install autoawq ...@@ -15,29 +15,31 @@ pip install autoawq
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
```python ??? Code
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'mistralai/Mistral-7B-Instruct-v0.2' ```python
quant_path = 'mistral-instruct-v0.2-awq' from awq import AutoAWQForCausalLM
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } from transformers import AutoTokenizer
# Load model model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
model = AutoAWQForCausalLM.from_pretrained( quant_path = 'mistral-instruct-v0.2-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
# Load model
model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False} model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
) )
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize # Quantize
model.quantize(tokenizer, quant_config=quant_config) model.quantize(tokenizer, quant_config=quant_config)
# Save quantized model # Save quantized model
model.save_quantized(quant_path) model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path) tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"') print(f'Model is quantized and saved at "{quant_path}"')
``` ```
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
...@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \ ...@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
AWQ models are also supported directly through the LLM entrypoint: AWQ models are also supported directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
# Sample prompts. ```python
prompts = [ from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
"The capital of France is", "The capital of France is",
"The future of AI is", "The future of AI is",
] ]
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Create an LLM.
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
...@@ -43,17 +43,19 @@ llm = LLM( ...@@ -43,17 +43,19 @@ llm = LLM(
## Read gptq format checkpoint ## Read gptq format checkpoint
```python ??? Code
from vllm import LLM
import torch
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. ```python
model_id = "hxbgsyxh/llama-13b-4bit-g-1" from vllm import LLM
llm = LLM( import torch
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
llm = LLM(
model=model_id, model=model_id,
dtype=torch.float16, dtype=torch.float16,
trust_remote_code=True, trust_remote_code=True,
quantization="bitblas", quantization="bitblas",
max_model_len=1024 max_model_len=1024
) )
``` ```
...@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r ...@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier ```python
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
# Configure the simple PTQ quantization # Configure the simple PTQ quantization
recipe = QuantizationModifier( recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
# Apply the quantization algorithm. # Apply the quantization algorithm.
oneshot(model=model, recipe=recipe) oneshot(model=model, recipe=recipe)
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR) model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR)
``` ```
### 3. Evaluating Accuracy ### 3. Evaluating Accuracy
......
...@@ -41,11 +41,13 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ ...@@ -41,11 +41,13 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also use the GGUF model directly through the LLM entrypoint: You can also use the GGUF model directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
# In this script, we demonstrate how to pass input to the chat method: ```python
conversation = [ from vllm import LLM, SamplingParams
# In this script, we demonstrate how to pass input to the chat method:
conversation = [
{ {
"role": "system", "role": "system",
"content": "You are a helpful assistant" "content": "You are a helpful assistant"
...@@ -62,21 +64,21 @@ conversation = [ ...@@ -62,21 +64,21 @@ conversation = [
"role": "user", "role": "user",
"content": "Write an essay about the importance of higher education.", "content": "Write an essay about the importance of higher education.",
}, },
] ]
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Create an LLM.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params) outputs = llm.chat(conversation, sampling_params)
# Print the outputs. # Print the outputs.
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
...@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t ...@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
```python ??? Code
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
model_id = "meta-llama/Llama-3.2-1B-Instruct" ```python
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit" from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
calibration_dataset = load_dataset( model_id = "meta-llama/Llama-3.2-1B-Instruct"
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
calibration_dataset = load_dataset(
"allenai/c4", "allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz", data_files="en/c4-train.00001-of-01024.json.gz",
split="train" split="train"
).select(range(1024))["text"] ).select(range(1024))["text"]
quant_config = QuantizeConfig(bits=4, group_size=128) quant_config = QuantizeConfig(bits=4, group_size=128)
model = GPTQModel.load(model_id, quant_config) model = GPTQModel.load(model_id, quant_config)
# increase `batch_size` to match gpu/vram specs to speed up quantization # increase `batch_size` to match gpu/vram specs to speed up quantization
model.quantize(calibration_dataset, batch_size=2) model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path) model.save(quant_path)
``` ```
## Running a quantized model with vLLM ## Running a quantized model with vLLM
...@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \ ...@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
GPTQModel quantized models are also supported directly through the LLM entrypoint: GPTQModel quantized models are also supported directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
```python
from vllm import LLM, SamplingParams
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
"The capital of France is", "The capital of France is",
"The future of AI is", "The future of AI is",
] ]
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9) sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
# Create an LLM. # Create an LLM.
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
print("-"*50) print("-"*50)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50) print("-"*50)
``` ```
...@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd ...@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python ??? Code
from datasets import load_dataset
```python
from datasets import load_dataset
NUM_CALIBRATION_SAMPLES = 512 NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048 MAX_SEQUENCE_LENGTH = 2048
# Load and preprocess the dataset # Load and preprocess the dataset
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example): def preprocess(example):
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(preprocess) ds = ds.map(preprocess)
def tokenize(sample): def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names) ds = ds.map(tokenize, remove_columns=ds.column_names)
``` ```
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier ```python
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Configure the quantization algorithms # Configure the quantization algorithms
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]) recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
# Apply quantization # Apply quantization
oneshot( oneshot(
model=model, model=model,
dataset=ds, dataset=ds,
recipe=recipe, recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH, max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES, num_calibration_samples=NUM_CALIBRATION_SAMPLES,
) )
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 # Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True) model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR)
``` ```
This process creates a W4A16 model with weights quantized to 4-bit integers. This process creates a W4A16 model with weights quantized to 4-bit integers.
...@@ -137,14 +141,16 @@ $ lm_eval --model vllm \ ...@@ -137,14 +141,16 @@ $ lm_eval --model vllm \
The following is an example of an expanded quantization recipe you can tune to your own use case: The following is an example of an expanded quantization recipe you can tune to your own use case:
```python ??? Code
from compressed_tensors.quantization import (
```python
from compressed_tensors.quantization import (
QuantizationArgs, QuantizationArgs,
QuantizationScheme, QuantizationScheme,
QuantizationStrategy, QuantizationStrategy,
QuantizationType, QuantizationType,
) )
recipe = GPTQModifier( recipe = GPTQModifier(
targets="Linear", targets="Linear",
config_groups={ config_groups={
"config_group": QuantizationScheme( "config_group": QuantizationScheme(
...@@ -163,8 +169,8 @@ recipe = GPTQModifier( ...@@ -163,8 +169,8 @@ recipe = GPTQModifier(
ignore=["lm_head"], ignore=["lm_head"],
update_size=NUM_CALIBRATION_SAMPLES, update_size=NUM_CALIBRATION_SAMPLES,
dampening_frac=0.01 dampening_frac=0.01
) )
``` ```
## Troubleshooting and Support ## Troubleshooting and Support
......
...@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa ...@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python ??? Code
from datasets import load_dataset
```python
from datasets import load_dataset
NUM_CALIBRATION_SAMPLES = 512 NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048 MAX_SEQUENCE_LENGTH = 2048
# Load and preprocess the dataset # Load and preprocess the dataset
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example): def preprocess(example):
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(preprocess) ds = ds.map(preprocess)
def tokenize(sample): def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names) ds = ds.map(tokenize, remove_columns=ds.column_names)
``` ```
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier ```python
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Configure the quantization algorithms # Configure the quantization algorithms
recipe = [ recipe = [
SmoothQuantModifier(smoothing_strength=0.8), SmoothQuantModifier(smoothing_strength=0.8),
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
] ]
# Apply quantization # Apply quantization
oneshot( oneshot(
model=model, model=model,
dataset=ds, dataset=ds,
recipe=recipe, recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH, max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES, num_calibration_samples=NUM_CALIBRATION_SAMPLES,
) )
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True) model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR)
``` ```
This process creates a W8A8 model with weights and activations quantized to 8-bit integers. This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
......
...@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te ...@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
Below is an example showing how to quantize a model using modelopt's PTQ API: Below is an example showing how to quantize a model using modelopt's PTQ API:
```python ??? Code
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM ```python
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
# Load the model from HuggingFace # Load the model from HuggingFace
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>") model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
# Select the quantization config, for example, FP8 # Select the quantization config, for example, FP8
config = mtq.FP8_DEFAULT_CFG config = mtq.FP8_DEFAULT_CFG
# Define a forward loop function for calibration # Define a forward loop function for calibration
def forward_loop(model): def forward_loop(model):
for data in calib_set: for data in calib_set:
model(data) model(data)
# PTQ with in-place replacement of quantized modules # PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, config, forward_loop) model = mtq.quantize(model, config, forward_loop)
``` ```
After the model is quantized, you can export it to a quantized checkpoint using the export API: After the model is quantized, you can export it to a quantized checkpoint using the export API:
...@@ -48,10 +50,12 @@ with torch.inference_mode(): ...@@ -48,10 +50,12 @@ with torch.inference_mode():
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM: The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
```python ??? Code
from vllm import LLM, SamplingParams
```python
from vllm import LLM, SamplingParams
def main(): def main():
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
...@@ -73,6 +77,6 @@ def main(): ...@@ -73,6 +77,6 @@ def main():
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()
``` ```
...@@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades ...@@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
Here is an example of how to enable FP8 quantization: Here is an example of how to enable FP8 quantization:
```python ??? Code
# To calculate kv cache scales on the fly enable the calculate_kv_scales
# parameter
from vllm import LLM, SamplingParams ```python
# To calculate kv cache scales on the fly enable the calculate_kv_scales
# parameter
sampling_params = SamplingParams(temperature=0.7, top_p=0.8) from vllm import LLM, SamplingParams
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
kv_cache_dtype="fp8", kv_cache_dtype="fp8",
calculate_kv_scales=True) calculate_kv_scales=True)
prompt = "London is the capital of" prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out) print(out)
``` ```
The `kv_cache_dtype` argument specifies the data type for KV cache storage: The `kv_cache_dtype` argument specifies the data type for KV cache storage:
- `"auto"`: Uses the model's default "unquantized" data type - `"auto"`: Uses the model's default "unquantized" data type
...@@ -71,29 +73,31 @@ pip install llmcompressor ...@@ -71,29 +73,31 @@ pip install llmcompressor
Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
```python ??? Code
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor.transformers import oneshot
# Select model and load it ```python
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" from datasets import load_dataset
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) from llmcompressor.transformers import oneshot
# Select calibration dataset # Select model and load it
DATASET_ID = "HuggingFaceH4/ultrachat_200k" MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
DATASET_SPLIT = "train_sft" model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Configure calibration parameters # Select calibration dataset
NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point DATASET_ID = "HuggingFaceH4/ultrachat_200k"
MAX_SEQUENCE_LENGTH = 2048 DATASET_SPLIT = "train_sft"
# Load and preprocess dataset # Configure calibration parameters
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) MAX_SEQUENCE_LENGTH = 2048
def process_and_tokenize(example): # Load and preprocess dataset
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def process_and_tokenize(example):
text = tokenizer.apply_chat_template(example["messages"], tokenize=False) text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
return tokenizer( return tokenizer(
text, text,
...@@ -103,11 +107,11 @@ def process_and_tokenize(example): ...@@ -103,11 +107,11 @@ def process_and_tokenize(example):
add_special_tokens=False, add_special_tokens=False,
) )
ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
# Configure quantization settings # Configure quantization settings
recipe = """ recipe = """
quant_stage: quant_stage:
quant_modifiers: quant_modifiers:
QuantizationModifier: QuantizationModifier:
kv_cache_scheme: kv_cache_scheme:
...@@ -116,22 +120,22 @@ quant_stage: ...@@ -116,22 +120,22 @@ quant_stage:
strategy: tensor strategy: tensor
dynamic: false dynamic: false
symmetric: true symmetric: true
""" """
# Apply quantization # Apply quantization
oneshot( oneshot(
model=model, model=model,
dataset=ds, dataset=ds,
recipe=recipe, recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH, max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES, num_calibration_samples=NUM_CALIBRATION_SAMPLES,
) )
# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV # Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True) model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR) tokenizer.save_pretrained(SAVE_DIR)
``` ```
The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
......
...@@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below: ...@@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below:
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index) Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
to fetch model and tokenizer. to fetch model and tokenizer.
```python ??? Code
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf" ```python
MAX_SEQ_LEN = 512 from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained( MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
MAX_SEQ_LEN = 512
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID, device_map="auto", torch_dtype="auto",
) )
model.eval() model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
tokenizer.pad_token = tokenizer.eos_token tokenizer.pad_token = tokenizer.eos_token
``` ```
### 2. Prepare the Calibration Dataloader ### 2. Prepare the Calibration Dataloader
...@@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic ...@@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
to load calibration data. For more details about how to use calibration datasets efficiently, please refer to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html). to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
```python ??? Code
from datasets import load_dataset
from torch.utils.data import DataLoader ```python
from datasets import load_dataset
from torch.utils.data import DataLoader
BATCH_SIZE = 1 BATCH_SIZE = 1
NUM_CALIBRATION_DATA = 512 NUM_CALIBRATION_DATA = 512
# Load the dataset and get calibration data. # Load the dataset and get calibration data.
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA] text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer(text_data, return_tensors="pt", tokenized_outputs = tokenizer(text_data, return_tensors="pt",
padding=True, truncation=True, max_length=MAX_SEQ_LEN) padding=True, truncation=True, max_length=MAX_SEQ_LEN)
calib_dataloader = DataLoader(tokenized_outputs['input_ids'], calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE, drop_last=True) batch_size=BATCH_SIZE, drop_last=True)
``` ```
### 3. Set the Quantization Configuration ### 3. Set the Quantization Configuration
...@@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. ...@@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
AutoSmoothQuant config file for Llama is AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
```python ??? Code
from quark.torch.quantization import (Config, QuantizationConfig,
```python
from quark.torch.quantization import (Config, QuantizationConfig,
FP8E4M3PerTensorSpec, FP8E4M3PerTensorSpec,
load_quant_algo_config_from_file) load_quant_algo_config_from_file)
# Define fp8/per-tensor/static spec. # Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
is_dynamic=False).to_quantization_spec() is_dynamic=False).to_quantization_spec()
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC) weight=FP8_PER_TENSOR_SPEC)
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name : kv_cache_quant_config = {name :
QuantizationConfig(input_tensors=global_quant_config.input_tensors, QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight, weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC) output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama} for name in kv_cache_layer_names_for_llama}
layer_quant_config = kv_cache_quant_config.copy() layer_quant_config = kv_cache_quant_config.copy()
# Define algorithm config by config file. # Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
EXCLUDE_LAYERS = ["lm_head"] EXCLUDE_LAYERS = ["lm_head"]
quant_config = Config( quant_config = Config(
global_quant_config=global_quant_config, global_quant_config=global_quant_config,
layer_quant_config=layer_quant_config, layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config, kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS, exclude=EXCLUDE_LAYERS,
algo_config=algo_config) algo_config=algo_config)
``` ```
### 4. Quantize the Model and Export ### 4. Quantize the Model and Export
...@@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to ...@@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html) [HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more exporting format details. for more exporting format details.
```python ??? Code
import torch
from quark.torch import ModelQuantizer, ModelExporter ```python
from quark.torch.export import ExporterConfig, JsonExporterConfig import torch
from quark.torch import ModelQuantizer, ModelExporter
from quark.torch.export import ExporterConfig, JsonExporterConfig
# Apply quantization. # Apply quantization.
quantizer = ModelQuantizer(quant_config) quantizer = ModelQuantizer(quant_config)
quant_model = quantizer.quantize_model(model, calib_dataloader) quant_model = quantizer.quantize_model(model, calib_dataloader)
# Freeze quantized model to export. # Freeze quantized model to export.
freezed_model = quantizer.freeze(model) freezed_model = quantizer.freeze(model)
# Define export config. # Define export config.
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"] LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
export_config = ExporterConfig(json_export_config=JsonExporterConfig()) export_config = ExporterConfig(json_export_config=JsonExporterConfig())
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad(): with torch.no_grad():
exporter.export_safetensors_model(freezed_model, exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer) quant_config=quant_config, tokenizer=tokenizer)
``` ```
### 5. Evaluation in vLLM ### 5. Evaluation in vLLM
Now, you can load and run the Quark quantized model directly through the LLM entrypoint: Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
# Sample prompts. ```python
prompts = [ from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
"The capital of France is", "The capital of France is",
"The future of AI is", "The future of AI is",
] ]
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype='fp8',quantization='quark') kv_cache_dtype='fp8',quantization='quark')
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60) print("\nGenerated Outputs:\n" + "-" * 60)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}") print(f"Prompt: {prompt!r}")
print(f"Output: {generated_text!r}") print(f"Output: {generated_text!r}")
print("-" * 60) print("-" * 60)
``` ```
Or, you can use `lm_eval` to evaluate accuracy: Or, you can use `lm_eval` to evaluate accuracy:
......
...@@ -15,26 +15,28 @@ pip install \ ...@@ -15,26 +15,28 @@ pip install \
## Quantizing HuggingFace Models ## Quantizing HuggingFace Models
You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code: You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:
```Python ??? Code
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer ```Python
from torchao.quantization import Int8WeightOnlyConfig import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Meta-Llama-3-8B" from torchao.quantization import Int8WeightOnlyConfig
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained( model_name = "meta-llama/Meta-Llama-3-8B"
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(
model_name, model_name,
torch_dtype="auto", torch_dtype="auto",
device_map="auto", device_map="auto",
quantization_config=quantization_config quantization_config=quantization_config
) )
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?" input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
hub_repo = # YOUR HUB REPO ID hub_repo = # YOUR HUB REPO ID
tokenizer.push_to_hub(hub_repo) tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False) quantized_model.push_to_hub(hub_repo, safe_serialization=False)
``` ```
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI. Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
...@@ -33,34 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ ...@@ -33,34 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
Next, make a request to the model that should return the reasoning content in the response. Next, make a request to the model that should return the reasoning content in the response.
```python ??? Code
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server. ```python
openai_api_key = "EMPTY" from openai import OpenAI
openai_api_base = "http://localhost:8000/v1"
client = OpenAI( # Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
models = client.models.list() models = client.models.list()
model = models.data[0].id model = models.data[0].id
# Round 1 # Round 1
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add: # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}} # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
response = client.chat.completions.create(model=model, messages=messages) response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content content = response.choices[0].message.content
print("reasoning_content:", reasoning_content) print("reasoning_content:", reasoning_content)
print("content:", content) print("content:", content)
``` ```
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
...@@ -68,8 +70,10 @@ The `reasoning_content` field contains the reasoning steps that led to the final ...@@ -68,8 +70,10 @@ The `reasoning_content` field contains the reasoning steps that led to the final
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
```json ??? Json
{
```json
{
"id": "chatcmpl-123", "id": "chatcmpl-123",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"created": 1694268190, "created": 1694268190,
...@@ -86,39 +90,41 @@ Streaming chat completions are also supported for reasoning models. The `reasoni ...@@ -86,39 +90,41 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
"finish_reason": null "finish_reason": null
} }
] ]
} }
``` ```
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example: The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example:
```python ??? Code
from openai import OpenAI
```python
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server. # Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( client = OpenAI(
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
models = client.models.list() models = client.models.list()
model = models.data[0].id model = models.data[0].id
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add: # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}} # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model, stream = client.chat.completions.create(model=model,
messages=messages, messages=messages,
stream=True) stream=True)
print("client: Start streaming chat completions...") print("client: Start streaming chat completions...")
printed_reasoning_content = False printed_reasoning_content = False
printed_content = False printed_content = False
for chunk in stream: for chunk in stream:
reasoning_content = None reasoning_content = None
content = None content = None
# Check the content is reasoning_content or content # Check the content is reasoning_content or content
...@@ -138,7 +144,7 @@ for chunk in stream: ...@@ -138,7 +144,7 @@ for chunk in stream:
print("\ncontent:", end="", flush=True) print("\ncontent:", end="", flush=True)
# Extract and print the content # Extract and print the content
print(content, end="", flush=True) print(content, end="", flush=True)
``` ```
Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
...@@ -146,12 +152,14 @@ Remember to check whether the `reasoning_content` exists in the response before ...@@ -146,12 +152,14 @@ Remember to check whether the `reasoning_content` exists in the response before
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`. The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
```python ??? Code
from openai import OpenAI
```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
tools = [{ tools = [{
"type": "function", "type": "function",
"function": { "function": {
"name": "get_weather", "name": "get_weather",
...@@ -165,22 +173,22 @@ tools = [{ ...@@ -165,22 +173,22 @@ tools = [{
"required": ["location", "unit"] "required": ["location", "unit"]
} }
} }
}] }]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto" tool_choice="auto"
) )
print(response) print(response)
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {response.choices[0].message.reasoning_content}") print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
print(f"Function called: {tool_call.name}") print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}") print(f"Arguments: {tool_call.arguments}")
``` ```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>. For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
...@@ -192,18 +200,20 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_ ...@@ -192,18 +200,20 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>. You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
```python ??? Code
# import the required packages
```python
# import the required packages
from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage) DeltaMessage)
# define a reasoning parser and register it to vllm # define a reasoning parser and register it to vllm
# the name list in register_module can be used # the name list in register_module can be used
# in --reasoning-parser. # in --reasoning-parser.
@ReasoningParserManager.register_module(["example"]) @ReasoningParserManager.register_module(["example"])
class ExampleParser(ReasoningParser): class ExampleParser(ReasoningParser):
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer) super().__init__(tokenizer)
...@@ -244,13 +254,15 @@ class ExampleParser(ReasoningParser): ...@@ -244,13 +254,15 @@ class ExampleParser(ReasoningParser):
tuple[Optional[str], Optional[str]] tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content. A tuple containing the reasoning content and the content.
""" """
``` ```
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>. Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
```python ??? Code
@dataclass
class DeepSeekReasoner(Reasoner): ```python
@dataclass
class DeepSeekReasoner(Reasoner):
""" """
Reasoner for DeepSeek R series models. Reasoner for DeepSeek R series models.
""" """
...@@ -270,7 +282,7 @@ class DeepSeekReasoner(Reasoner): ...@@ -270,7 +282,7 @@ class DeepSeekReasoner(Reasoner):
def is_reasoning_end(self, input_ids: list[int]) -> bool: def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids return self.end_token_id in input_ids
... ...
``` ```
The structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case. The structured output engine like [xgrammar](https://github.com/mlc-ai/xgrammar) will use `end_token_id` to check if the reasoning content is present in the model output and skip the structured output if it is the case.
......
...@@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory ...@@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
```python ??? Code
from vllm import LLM, SamplingParams
prompts = [ ```python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is", "The future of AI is",
] ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( llm = LLM(
model="facebook/opt-6.7b", model="facebook/opt-6.7b",
tensor_parallel_size=1, tensor_parallel_size=1,
speculative_config={ speculative_config={
"model": "facebook/opt-125m", "model": "facebook/opt-125m",
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
}, },
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
To do the same in online mode, launch the server: To do the same in online mode, launch the server:
...@@ -60,54 +62,58 @@ python -m vllm.entrypoints.openai.api_server \ ...@@ -60,54 +62,58 @@ python -m vllm.entrypoints.openai.api_server \
Then use a client: Then use a client:
```python ??? Code
from openai import OpenAI
```python
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server. # Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI( client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY") # defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key, api_key=openai_api_key,
base_url=openai_api_base, base_url=openai_api_base,
) )
models = client.models.list() models = client.models.list()
model = models.data[0].id model = models.data[0].id
# Completion API # Completion API
stream = False stream = False
completion = client.completions.create( completion = client.completions.create(
model=model, model=model,
prompt="The future of AI is", prompt="The future of AI is",
echo=False, echo=False,
n=1, n=1,
stream=stream, stream=stream,
) )
print("Completion results:") print("Completion results:")
if stream: if stream:
for c in completion: for c in completion:
print(c) print(c)
else: else:
print(completion) print(completion)
``` ```
## Speculating by matching n-grams in the prompt ## Speculating by matching n-grams in the prompt
The following code configures vLLM to use speculative decoding where proposals are generated by The following code configures vLLM to use speculative decoding where proposals are generated by
matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259) matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259)
```python ??? Code
from vllm import LLM, SamplingParams
```python
from vllm import LLM, SamplingParams
prompts = [ prompts = [
"The future of AI is", "The future of AI is",
] ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( llm = LLM(
model="facebook/opt-6.7b", model="facebook/opt-6.7b",
tensor_parallel_size=1, tensor_parallel_size=1,
speculative_config={ speculative_config={
...@@ -115,14 +121,14 @@ llm = LLM( ...@@ -115,14 +121,14 @@ llm = LLM(
"num_speculative_tokens": 5, "num_speculative_tokens": 5,
"prompt_lookup_max": 4, "prompt_lookup_max": 4,
}, },
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
## Speculating using MLP speculators ## Speculating using MLP speculators
...@@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam ...@@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam
For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
[this technical report](https://arxiv.org/abs/2404.19124). [this technical report](https://arxiv.org/abs/2404.19124).
```python ??? Code
from vllm import LLM, SamplingParams
```python
from vllm import LLM, SamplingParams
prompts = [ prompts = [
"The future of AI is", "The future of AI is",
] ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( llm = LLM(
model="meta-llama/Meta-Llama-3.1-70B-Instruct", model="meta-llama/Meta-Llama-3.1-70B-Instruct",
tensor_parallel_size=4, tensor_parallel_size=4,
speculative_config={ speculative_config={
"model": "ibm-ai-platform/llama3-70b-accelerator", "model": "ibm-ai-platform/llama3-70b-accelerator",
"draft_tensor_parallel_size": 1, "draft_tensor_parallel_size": 1,
}, },
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
Note that these speculative models currently need to be run without tensor parallelism, although Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the it is possible to run the main model using tensor parallelism (see example above). Since the
...@@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub: ...@@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub:
The following code configures vLLM to use speculative decoding where proposals are generated by The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py). an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
```python ??? Code
from vllm import LLM, SamplingParams
prompts = [ ```python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is", "The future of AI is",
] ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct", model="meta-llama/Meta-Llama-3-8B-Instruct",
tensor_parallel_size=4, tensor_parallel_size=4,
speculative_config={ speculative_config={
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1, "draft_tensor_parallel_size": 1,
}, },
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
A few important things to consider when using the EAGLE based draft models: A few important things to consider when using the EAGLE based draft models:
......
...@@ -33,28 +33,32 @@ text. ...@@ -33,28 +33,32 @@ text.
Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one: Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:
```python ??? Code
from openai import OpenAI
client = OpenAI( ```python
from openai import OpenAI
client = OpenAI(
base_url="http://localhost:8000/v1", base_url="http://localhost:8000/v1",
api_key="-", api_key="-",
) )
model = client.models.list().data[0].id model = client.models.list().data[0].id
completion = client.chat.completions.create( completion = client.chat.completions.create(
model=model, model=model,
messages=[ messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
], ],
extra_body={"guided_choice": ["positive", "negative"]}, extra_body={"guided_choice": ["positive", "negative"]},
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
```python ??? Code
completion = client.chat.completions.create(
```python
completion = client.chat.completions.create(
model=model, model=model,
messages=[ messages=[
{ {
...@@ -63,9 +67,9 @@ completion = client.chat.completions.create( ...@@ -63,9 +67,9 @@ completion = client.chat.completions.create(
} }
], ],
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
For this we can use the `guided_json` parameter in two different ways: For this we can use the `guided_json` parameter in two different ways:
...@@ -75,24 +79,26 @@ For this we can use the `guided_json` parameter in two different ways: ...@@ -75,24 +79,26 @@ For this we can use the `guided_json` parameter in two different ways:
The next example shows how to use the `guided_json` parameter with a Pydantic model: The next example shows how to use the `guided_json` parameter with a Pydantic model:
```python ??? Code
from pydantic import BaseModel
from enum import Enum ```python
from pydantic import BaseModel
from enum import Enum
class CarType(str, Enum): class CarType(str, Enum):
sedan = "sedan" sedan = "sedan"
suv = "SUV" suv = "SUV"
truck = "Truck" truck = "Truck"
coupe = "Coupe" coupe = "Coupe"
class CarDescription(BaseModel): class CarDescription(BaseModel):
brand: str brand: str
model: str model: str
car_type: CarType car_type: CarType
json_schema = CarDescription.model_json_schema() json_schema = CarDescription.model_json_schema()
completion = client.chat.completions.create( completion = client.chat.completions.create(
model=model, model=model,
messages=[ messages=[
{ {
...@@ -107,9 +113,9 @@ completion = client.chat.completions.create( ...@@ -107,9 +113,9 @@ completion = client.chat.completions.create(
"schema": CarDescription.model_json_schema() "schema": CarDescription.model_json_schema()
}, },
}, },
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
!!! tip !!! tip
While not strictly necessary, normally it's better to indicate in the prompt the While not strictly necessary, normally it's better to indicate in the prompt the
...@@ -121,8 +127,10 @@ difficult to use, but it´s really powerful. It allows us to define complete ...@@ -121,8 +127,10 @@ difficult to use, but it´s really powerful. It allows us to define complete
languages like SQL queries. It works by using a context free EBNF grammar. languages like SQL queries. It works by using a context free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries: As an example, we can use it to define a specific format of simplified SQL queries:
```python ??? Code
simplified_sql_grammar = """
```python
simplified_sql_grammar = """
root ::= select_statement root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition select_statement ::= "SELECT " column " from " table " where " condition
...@@ -134,9 +142,9 @@ simplified_sql_grammar = """ ...@@ -134,9 +142,9 @@ simplified_sql_grammar = """
condition ::= column "= " number condition ::= column "= " number
number ::= "1 " | "2 " number ::= "1 " | "2 "
""" """
completion = client.chat.completions.create( completion = client.chat.completions.create(
model=model, model=model,
messages=[ messages=[
{ {
...@@ -145,9 +153,9 @@ completion = client.chat.completions.create( ...@@ -145,9 +153,9 @@ completion = client.chat.completions.create(
} }
], ],
extra_body={"guided_grammar": simplified_sql_grammar}, extra_body={"guided_grammar": simplified_sql_grammar},
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
...@@ -161,16 +169,18 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r ...@@ -161,16 +169,18 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r
Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema: Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
```python ??? Code
from pydantic import BaseModel
```python
from pydantic import BaseModel
class People(BaseModel): class People(BaseModel):
name: str name: str
age: int age: int
completion = client.chat.completions.create( completion = client.chat.completions.create(
model=model, model=model,
messages=[ messages=[
{ {
...@@ -185,10 +195,10 @@ completion = client.chat.completions.create( ...@@ -185,10 +195,10 @@ completion = client.chat.completions.create(
"schema": People.model_json_schema() "schema": People.model_json_schema()
} }
}, },
) )
print("reasoning_content: ", completion.choices[0].message.reasoning_content) print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content) print("content: ", completion.choices[0].message.content)
``` ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
...@@ -202,33 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3. ...@@ -202,33 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.
Here is a simple example demonstrating how to get structured output using Pydantic models: Here is a simple example demonstrating how to get structured output using Pydantic models:
```python ??? Code
from pydantic import BaseModel
from openai import OpenAI ```python
from pydantic import BaseModel
from openai import OpenAI
class Info(BaseModel): class Info(BaseModel):
name: str name: str
age: int age: int
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
model = client.models.list().data[0].id model = client.models.list().data[0].id
completion = client.beta.chat.completions.parse( completion = client.beta.chat.completions.parse(
model=model, model=model,
messages=[ messages=[
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
], ],
response_format=Info, response_format=Info,
) )
message = completion.choices[0].message message = completion.choices[0].message
print(message) print(message)
assert message.parsed assert message.parsed
print("Name:", message.parsed.name) print("Name:", message.parsed.name)
print("Age:", message.parsed.age) print("Age:", message.parsed.age)
``` ```
Output:
```console ```console
ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
...@@ -238,35 +248,37 @@ Age: 28 ...@@ -238,35 +248,37 @@ Age: 28
Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
```python ??? Code
from typing import List
from pydantic import BaseModel
from openai import OpenAI
class Step(BaseModel): ```python
from typing import List
from pydantic import BaseModel
from openai import OpenAI
class Step(BaseModel):
explanation: str explanation: str
output: str output: str
class MathResponse(BaseModel): class MathResponse(BaseModel):
steps: list[Step] steps: list[Step]
final_answer: str final_answer: str
completion = client.beta.chat.completions.parse( completion = client.beta.chat.completions.parse(
model=model, model=model,
messages=[ messages=[
{"role": "system", "content": "You are a helpful expert math tutor."}, {"role": "system", "content": "You are a helpful expert math tutor."},
{"role": "user", "content": "Solve 8x + 31 = 2."}, {"role": "user", "content": "Solve 8x + 31 = 2."},
], ],
response_format=MathResponse, response_format=MathResponse,
) )
message = completion.choices[0].message message = completion.choices[0].message
print(message) print(message)
assert message.parsed assert message.parsed
for i, step in enumerate(message.parsed.steps): for i, step in enumerate(message.parsed.steps):
print(f"Step #{i}:", step) print(f"Step #{i}:", step)
print("Answer:", message.parsed.final_answer) print("Answer:", message.parsed.final_answer)
``` ```
Output: Output:
...@@ -296,19 +308,21 @@ These parameters can be used in the same way as the parameters from the Online ...@@ -296,19 +308,21 @@ These parameters can be used in the same way as the parameters from the Online
Serving examples above. One example for the usage of the `choice` parameter is Serving examples above. One example for the usage of the `choice` parameter is
shown below: shown below:
```python ??? Code
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") ```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate( guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
prompts="Classify this sentiment: vLLM is wonderful!", prompts="Classify this sentiment: vLLM is wonderful!",
sampling_params=sampling_params, sampling_params=sampling_params,
) )
print(outputs[0].outputs[0].text) print(outputs[0].outputs[0].text)
``` ```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
...@@ -15,17 +15,19 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \ ...@@ -15,17 +15,19 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \
Next, make a request to the model that should result in it using the available tools: Next, make a request to the model that should result in it using the available tools:
```python ??? Code
from openai import OpenAI
import json ```python
from openai import OpenAI
import json
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
def get_weather(location: str, unit: str): def get_weather(location: str, unit: str):
return f"Getting the weather for {location} in {unit}..." return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather} tool_functions = {"get_weather": get_weather}
tools = [{ tools = [{
"type": "function", "type": "function",
"function": { "function": {
"name": "get_weather", "name": "get_weather",
...@@ -39,20 +41,20 @@ tools = [{ ...@@ -39,20 +41,20 @@ tools = [{
"required": ["location", "unit"] "required": ["location", "unit"]
} }
} }
}] }]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto" tool_choice="auto"
) )
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
print(f"Function called: {tool_call.name}") print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}") print(f"Arguments: {tool_call.arguments}")
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
``` ```
Example output: Example output:
...@@ -301,16 +303,18 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen ...@@ -301,16 +303,18 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen
Here is a summary of a plugin file: Here is a summary of a plugin file:
```python ??? Code
```python
# import the required packages # import the required packages
# define a tool parser and register it to vllm # define a tool parser and register it to vllm
# the name list in register_module can be used # the name list in register_module can be used
# in --tool-call-parser. you can define as many # in --tool-call-parser. you can define as many
# tool parsers as you want here. # tool parsers as you want here.
@ToolParserManager.register_module(["example"]) @ToolParserManager.register_module(["example"])
class ExampleToolParser(ToolParser): class ExampleToolParser(ToolParser):
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer) super().__init__(tokenizer)
...@@ -343,7 +347,7 @@ class ExampleToolParser(ToolParser): ...@@ -343,7 +347,7 @@ class ExampleToolParser(ToolParser):
tool_calls=[], tool_calls=[],
content=text) content=text)
``` ```
Then you can use this plugin in the command line like this. Then you can use this plugin in the command line like this.
......
...@@ -76,11 +76,13 @@ Currently, there are no pre-built CPU wheels. ...@@ -76,11 +76,13 @@ Currently, there are no pre-built CPU wheels.
### Build image from source ### Build image from source
```console ??? Commands
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
```console
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
# Launching OpenAI server # Launching OpenAI server
$ docker run --rm \ $ docker run --rm \
--privileged=true \ --privileged=true \
--shm-size=4g \ --shm-size=4g \
-p 8000:8000 \ -p 8000:8000 \
...@@ -90,7 +92,7 @@ $ docker run --rm \ ...@@ -90,7 +92,7 @@ $ docker run --rm \
--model=meta-llama/Llama-3.2-1B-Instruct \ --model=meta-llama/Llama-3.2-1B-Instruct \
--dtype=bfloat16 \ --dtype=bfloat16 \
other vLLM OpenAI server arguments other vLLM OpenAI server arguments
``` ```
!!! tip !!! tip
For ARM or Apple silicon, use `docker/Dockerfile.arm` For ARM or Apple silicon, use `docker/Dockerfile.arm`
...@@ -144,32 +146,34 @@ vllm serve facebook/opt-125m ...@@ -144,32 +146,34 @@ vllm serve facebook/opt-125m
- If using the vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread to each physical CPU core, either by setting `VLLM_CPU_OMP_THREADS_BIND` or by using the auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: - If using the vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread to each physical CPU core, either by setting `VLLM_CPU_OMP_THREADS_BIND` or by using the auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
```console ??? Commands
$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
```console
# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
# On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic/basic.py # On this platform, it is recommended to only bind OpenMP threads on logical CPU cores 0-7 or 8-15
``` $ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic/basic.py
```
- If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access. - If using the vLLM CPU backend on a multi-socket machine with NUMA, be sure to set the CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment