Unverified Commit f17aec0d authored by Reid's avatar Reid Committed by GitHub
Browse files

[doc] Fold long code blocks to improve readability (#19926)


Signed-off-by: default avatarreidliu41 <reid201711@gmail.com>
Co-authored-by: default avatarreidliu41 <reid201711@gmail.com>
parent 493c2753
...@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall, ...@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
all results for output have been calculated but are just stored in all results for output have been calculated but are just stored in
different threads' register memory. different threads' register memory.
```cpp ??? Code
float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory.
...
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Lower warps update the output. ```cpp
const float* src = &out_smem[warp_idx * HEAD_SIZE]; float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory.
... ...
accs[i] += src[row_idx]; float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Lower warps update the output.
const float* src = &out_smem[warp_idx * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
accs[i] += src[row_idx];
}
// Write out the accs.
} }
```
// Write out the accs.
}
```
## Output ## Output
......
...@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture ( ...@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
```python ??? Code
# inside `setup.py` file
from setuptools import setup ```python
# inside `setup.py` file
setup(name='vllm_add_dummy_model', from setuptools import setup
version='0.1',
packages=['vllm_add_dummy_model'], setup(name='vllm_add_dummy_model',
entry_points={ version='0.1',
'vllm.general_plugins': packages=['vllm_add_dummy_model'],
["register_dummy_model = vllm_add_dummy_model:register"] entry_points={
}) 'vllm.general_plugins':
["register_dummy_model = vllm_add_dummy_model:register"]
# inside `vllm_add_dummy_model.py` file })
def register():
from vllm import ModelRegistry # inside `vllm_add_dummy_model.py` file
def register():
if "MyLlava" not in ModelRegistry.get_supported_archs(): from vllm import ModelRegistry
ModelRegistry.register_model(
"MyLlava", if "MyLlava" not in ModelRegistry.get_supported_archs():
"vllm_add_dummy_model.my_llava:MyLlava", ModelRegistry.register_model(
) "MyLlava",
``` "vllm_add_dummy_model.my_llava:MyLlava",
)
```
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
......
...@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa ...@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human-identifiable name, the second parameter is a globally unique ID for the adapter, and of `LoRARequest` is a human-identifiable name, the second parameter is a globally unique ID for the adapter, and
the third parameter is the path to the LoRA adapter. the third parameter is the path to the LoRA adapter.
```python ??? Code
sampling_params = SamplingParams(
temperature=0, ```python
max_tokens=256, sampling_params = SamplingParams(
stop=["[/assistant]"] temperature=0,
) max_tokens=256,
stop=["[/assistant]"]
prompts = [ )
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", prompts = [
] "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
outputs = llm.generate( ]
prompts,
sampling_params, outputs = llm.generate(
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) prompts,
) sampling_params,
``` lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
)
```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
...@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora ...@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it): with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it):
```bash ??? Command
curl localhost:8000/v1/models | jq .
{ ```bash
"object": "list", curl localhost:8000/v1/models | jq .
"data": [ {
{ "object": "list",
"id": "meta-llama/Llama-2-7b-hf", "data": [
"object": "model", {
... "id": "meta-llama/Llama-2-7b-hf",
}, "object": "model",
{ ...
"id": "sql-lora", },
"object": "model", {
... "id": "sql-lora",
} "object": "model",
] ...
} }
``` ]
}
```
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
...@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin: ...@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface. 1. Implement the LoRAResolver interface.
Example of a simple S3 LoRAResolver implementation: ??? Example of a simple S3 LoRAResolver implementation
```python ```python
import os import os
import s3fs import s3fs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver from vllm.lora.resolver import LoRAResolver
class S3LoRAResolver(LoRAResolver): class S3LoRAResolver(LoRAResolver):
def __init__(self): def __init__(self):
self.s3 = s3fs.S3FileSystem() self.s3 = s3fs.S3FileSystem()
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
async def resolve_lora(self, base_model_name, lora_name): async def resolve_lora(self, base_model_name, lora_name):
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
# Download the LoRA from S3 to the local path # Download the LoRA from S3 to the local path
await self.s3._get( await self.s3._get(
s3_path, local_path, recursive=True, maxdepth=1 s3_path, local_path, recursive=True, maxdepth=1
) )
lora_request = LoRARequest( lora_request = LoRARequest(
lora_name=lora_name, lora_name=lora_name,
lora_path=local_path, lora_path=local_path,
lora_int_id=abs(hash(lora_name)) lora_int_id=abs(hash(lora_name))
) )
return lora_request return lora_request
``` ```
2. Register `LoRAResolver` plugin. 2. Register `LoRAResolver` plugin.
...@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo ...@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the LoRA adapter. - The `root` field points to the artifact location of the LoRA adapter.
```bash ??? Command output
$ curl http://localhost:8000/v1/models
```bash
{ $ curl http://localhost:8000/v1/models
"object": "list",
"data": [ {
{ "object": "list",
"id": "meta-llama/Llama-2-7b-hf", "data": [
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"parent": null,
"permission": [
{ {
..... "id": "meta-llama/Llama-2-7b-hf",
} "object": "model",
] "created": 1715644056,
}, "owned_by": "vllm",
{ "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"id": "sql-lora", "parent": null,
"object": "model", "permission": [
"created": 1715644056, {
"owned_by": "vllm", .....
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", }
"parent": meta-llama/Llama-2-7b-hf, ]
"permission": [ },
{ {
.... "id": "sql-lora",
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
"parent": meta-llama/Llama-2-7b-hf,
"permission": [
{
....
}
]
} }
] ]
} }
] ```
}
```
...@@ -20,111 +20,117 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: ...@@ -20,111 +20,117 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
```python ??? Code
from vllm import LLM
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Load the image using PIL.Image
image = PIL.Image.open(...)
# Single prompt inference
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {"image": image},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
# Batch inference
image_1 = PIL.Image.open(...)
image_2 = PIL.Image.open(...)
outputs = llm.generate(
[
{
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_1},
},
{
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_2},
}
]
)
for o in outputs: ```python
generated_text = o.outputs[0].text from vllm import LLM
print(generated_text)
``` llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Refer to the HuggingFace repo for the correct format to use
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Load the image using PIL.Image
image = PIL.Image.open(...)
# Single prompt inference
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {"image": image},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
# Batch inference
image_1 = PIL.Image.open(...)
image_2 = PIL.Image.open(...)
outputs = llm.generate(
[
{
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_1},
},
{
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_2},
}
]
)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language.py> Full example: <gh-file:examples/offline_inference/vision_language.py>
To substitute multiple images inside the same text prompt, you can pass in a list of images instead: To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
```python ??? Code
from vllm import LLM
```python
llm = LLM( from vllm import LLM
model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, # Required to load Phi-3.5-vision llm = LLM(
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs model="microsoft/Phi-3.5-vision-instruct",
limit_mm_per_prompt={"image": 2}, # The maximum number to accept trust_remote_code=True, # Required to load Phi-3.5-vision
) max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
# Refer to the HuggingFace repo for the correct format to use )
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
# Refer to the HuggingFace repo for the correct format to use
# Load the images using PIL.Image prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
image1 = PIL.Image.open(...)
image2 = PIL.Image.open(...) # Load the images using PIL.Image
image1 = PIL.Image.open(...)
outputs = llm.generate({ image2 = PIL.Image.open(...)
"prompt": prompt,
"multi_modal_data": { outputs = llm.generate({
"image": [image1, image2] "prompt": prompt,
}, "multi_modal_data": {
}) "image": [image1, image2]
},
for o in outputs: })
generated_text = o.outputs[0].text
print(generated_text) for o in outputs:
``` generated_text = o.outputs[0].text
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py> Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
```python ??? Code
from vllm import LLM
# Specify the maximum number of frames per video to be 4. This can be changed. ```python
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) from vllm import LLM
# Create the request payload. # Specify the maximum number of frames per video to be 4. This can be changed.
video_frames = ... # load your video making sure it only has the number of frames specified earlier. llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
message = {
"role": "user", # Create the request payload.
"content": [ video_frames = ... # load your video making sure it only has the number of frames specified earlier.
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, message = {
], "role": "user",
} "content": [
for i in range(len(video_frames)): {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
base64_image = encode_image(video_frames[i]) # base64 encoding. ],
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} }
message["content"].append(new_image) for i in range(len(video_frames)):
base64_image = encode_image(video_frames[i]) # base64 encoding.
# Perform inference and log output. new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
outputs = llm.chat([message]) message["content"].append(new_image)
for o in outputs: # Perform inference and log output.
generated_text = o.outputs[0].text outputs = llm.chat([message])
print(generated_text)
``` for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
### Video Inputs ### Video Inputs
...@@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py> ...@@ -144,68 +150,72 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
```python ??? Code
from vllm import LLM
# Inference with image embeddings as input ```python
llm = LLM(model="llava-hf/llava-1.5-7b-hf") from vllm import LLM
# Refer to the HuggingFace repo for the correct format to use # Inference with image embeddings as input
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:" llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Embeddings for single image # Refer to the HuggingFace repo for the correct format to use
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
image_embeds = torch.load(...)
outputs = llm.generate({ # Embeddings for single image
"prompt": prompt, # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
"multi_modal_data": {"image": image_embeds}, image_embeds = torch.load(...)
})
for o in outputs: outputs = llm.generate({
generated_text = o.outputs[0].text "prompt": prompt,
print(generated_text) "multi_modal_data": {"image": image_embeds},
``` })
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
```python ??? Code
# Construct the prompt based on your model
prompt = ... ```python
# Construct the prompt based on your model
# Embeddings for multiple images prompt = ...
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
image_embeds = torch.load(...) # Embeddings for multiple images
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
# Qwen2-VL image_embeds = torch.load(...)
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
mm_data = { # Qwen2-VL
"image": { llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
"image_embeds": image_embeds, mm_data = {
# image_grid_thw is needed to calculate positional encoding. "image": {
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), "image_embeds": image_embeds,
# image_grid_thw is needed to calculate positional encoding.
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
}
} }
}
# MiniCPM-V
# MiniCPM-V llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) mm_data = {
mm_data = { "image": {
"image": { "image_embeds": image_embeds,
"image_embeds": image_embeds, # image_sizes is needed to calculate details of the sliced image.
# image_sizes is needed to calculate details of the sliced image. "image_sizes": [image.size for image in images], # list of image sizes
"image_sizes": [image.size for image in images], # list of image sizes }
} }
}
outputs = llm.generate({ outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
## Online Serving ## Online Serving
...@@ -235,51 +245,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ ...@@ -235,51 +245,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
from openai import OpenAI
```python
openai_api_key = "EMPTY" from openai import OpenAI
openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY"
client = OpenAI( openai_api_base = "http://localhost:8000/v1"
api_key=openai_api_key,
base_url=openai_api_base, client = OpenAI(
) api_key=openai_api_key,
base_url=openai_api_base,
# Single-image input inference )
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Single-image input inference
chat_response = client.chat.completions.create( image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
model="microsoft/Phi-3.5-vision-instruct",
messages=[{ chat_response = client.chat.completions.create(
"role": "user", model="microsoft/Phi-3.5-vision-instruct",
"content": [ messages=[{
# NOTE: The prompt formatting with the image token `<image>` is not needed "role": "user",
# since the prompt will be processed automatically by the API server. "content": [
{"type": "text", "text": "What’s in this image?"}, # NOTE: The prompt formatting with the image token `<image>` is not needed
{"type": "image_url", "image_url": {"url": image_url}}, # since the prompt will be processed automatically by the API server.
], {"type": "text", "text": "What’s in this image?"},
}], {"type": "image_url", "image_url": {"url": image_url}},
) ],
print("Chat completion output:", chat_response.choices[0].message.content) }],
)
# Multi-image input inference print("Chat completion output:", chat_response.choices[0].message.content)
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" # Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
chat_response = client.chat.completions.create( image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
model="microsoft/Phi-3.5-vision-instruct",
messages=[{ chat_response = client.chat.completions.create(
"role": "user", model="microsoft/Phi-3.5-vision-instruct",
"content": [ messages=[{
{"type": "text", "text": "What are the animals in these images?"}, "role": "user",
{"type": "image_url", "image_url": {"url": image_url_duck}}, "content": [
{"type": "image_url", "image_url": {"url": image_url_lion}}, {"type": "text", "text": "What are the animals in these images?"},
], {"type": "image_url", "image_url": {"url": image_url_duck}},
}], {"type": "image_url", "image_url": {"url": image_url_lion}},
) ],
print("Chat completion output:", chat_response.choices[0].message.content) }],
``` )
print("Chat completion output:", chat_response.choices[0].message.content)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -311,44 +323,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ...@@ -311,44 +323,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
from openai import OpenAI
openai_api_key = "EMPTY" ```python
openai_api_base = "http://localhost:8000/v1" from openai import OpenAI
client = OpenAI( openai_api_key = "EMPTY"
api_key=openai_api_key, openai_api_base = "http://localhost:8000/v1"
base_url=openai_api_base,
)
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
## Use video url in the payload video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this video?"
},
{
"type": "video_url",
"video_url": {
"url": video_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content ## Use video url in the payload
print("Chat completion output from image url:", result) chat_completion_from_url = client.chat.completions.create(
``` messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this video?"
},
{
"type": "video_url",
"video_url": {
"url": video_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -373,84 +387,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b ...@@ -373,84 +387,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def encode_base64_content_from_url(content_url: str) -> str: ```python
"""Encode a content retrieved from a remote url to base64 format.""" import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
with requests.get(content_url) as response: def encode_base64_content_from_url(content_url: str) -> str:
response.raise_for_status() """Encode a content retrieved from a remote url to base64 format."""
result = base64.b64encode(response.content).decode('utf-8')
return result with requests.get(content_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
openai_api_key = "EMPTY" return result
openai_api_base = "http://localhost:8000/v1"
client = OpenAI( openai_api_key = "EMPTY"
api_key=openai_api_key, openai_api_base = "http://localhost:8000/v1"
base_url=openai_api_base,
)
# Any format supported by librosa is supported client = OpenAI(
audio_url = AudioAsset("winning_call").url api_key=openai_api_key,
audio_base64 = encode_base64_content_from_url(audio_url) base_url=openai_api_base,
)
chat_completion_from_base64 = client.chat.completions.create( # Any format supported by librosa is supported
messages=[{ audio_url = AudioAsset("winning_call").url
"role": "user", audio_base64 = encode_base64_content_from_url(audio_url)
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64,
"format": "wav"
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content chat_completion_from_base64 = client.chat.completions.create(
print("Chat completion output from input audio:", result) messages=[{
``` "role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64,
"format": "wav"
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result)
```
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
```python ??? Code
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content ```python
print("Chat completion output from audio url:", result) chat_completion_from_url = client.chat.completions.create(
``` messages=[{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -470,61 +488,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary. ...@@ -470,61 +488,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field. For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server: The following example demonstrates how to pass image embeddings to the OpenAI server:
```python ??? Code
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct ```python
image_embedding = torch.load(...)
buffer = io.BytesIO() grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
torch.save(image_embedding, buffer)
buffer.seek(0) buffer = io.BytesIO()
binary_data = buffer.read() torch.save(image_embedding, buffer)
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') buffer.seek(0)
binary_data = buffer.read()
client = OpenAI( base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key, client = OpenAI(
base_url=openai_api_base, # defaults to os.environ.get("OPENAI_API_KEY")
) api_key=openai_api_key,
base_url=openai_api_base,
# Basic usage - this is equivalent to the LLaVA example for offline inference )
model = "llava-hf/llava-1.5-7b-hf"
embeds = { # Basic usage - this is equivalent to the LLaVA example for offline inference
"type": "image_embeds", model = "llava-hf/llava-1.5-7b-hf"
"image_embeds": f"{base64_image_embedding}" embeds = {
} "type": "image_embeds",
"image_embeds": f"{base64_image_embedding}"
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V) }
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = { # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
"type": "image_embeds", model = "Qwen/Qwen2-VL-2B-Instruct"
"image_embeds": { embeds = {
"image_embeds": f"{base64_image_embedding}" , # Required "type": "image_embeds",
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct "image_embeds": {
}, "image_embeds": f"{base64_image_embedding}" , # Required
} "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
model = "openbmb/MiniCPM-V-2_6"
embeds = {
"type": "image_embeds",
"image_embeds": {
"image_embeds": f"{base64_image_embedding}" , # Required
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
},
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"type": "text",
"text": "What's in this image?",
}, },
embeds, }
], model = "openbmb/MiniCPM-V-2_6"
}, embeds = {
], "type": "image_embeds",
model=model, "image_embeds": {
) "image_embeds": f"{base64_image_embedding}" , # Required
``` "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
},
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"type": "text",
"text": "What's in this image?",
},
embeds,
],
},
],
model=model,
)
```
!!! note !!! note
Only one message can contain `{"type": "image_embeds"}`. Only one message can contain `{"type": "image_embeds"}`.
......
...@@ -15,29 +15,31 @@ pip install autoawq ...@@ -15,29 +15,31 @@ pip install autoawq
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
```python ??? Code
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model_path = 'mistralai/Mistral-7B-Instruct-v0.2' ```python
quant_path = 'mistral-instruct-v0.2-awq' from awq import AutoAWQForCausalLM
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } from transformers import AutoTokenizer
# Load model model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
model = AutoAWQForCausalLM.from_pretrained( quant_path = 'mistral-instruct-v0.2-awq'
model_path, **{"low_cpu_mem_usage": True, "use_cache": False} quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Quantize # Load model
model.quantize(tokenizer, quant_config=quant_config) model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Save quantized model # Quantize
model.save_quantized(quant_path) model.quantize(tokenizer, quant_config=quant_config)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"') # Save quantized model
``` model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
```
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
...@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \ ...@@ -49,27 +51,29 @@ python examples/offline_inference/llm_engine_example.py \
AWQ models are also supported directly through the LLM entrypoint: AWQ models are also supported directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
```python
# Sample prompts. from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is", # Sample prompts.
"The president of the United States is", prompts = [
"The capital of France is", "Hello, my name is",
"The future of AI is", "The president of the United States is",
] "The capital of France is",
# Create a sampling params object. "The future of AI is",
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ]
# Create a sampling params object.
# Create an LLM. sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
# Generate texts from the prompts. The output is a list of RequestOutput objects # Create an LLM.
# that contain the prompt, generated text, and other information. llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ")
outputs = llm.generate(prompts, sampling_params) # Generate texts from the prompts. The output is a list of RequestOutput objects
# Print the outputs. # that contain the prompt, generated text, and other information.
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt # Print the outputs.
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
...@@ -43,17 +43,19 @@ llm = LLM( ...@@ -43,17 +43,19 @@ llm = LLM(
## Read gptq format checkpoint ## Read gptq format checkpoint
```python ??? Code
from vllm import LLM
import torch ```python
from vllm import LLM
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint. import torch
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
llm = LLM( # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
model=model_id, model_id = "hxbgsyxh/llama-13b-4bit-g-1"
dtype=torch.float16, llm = LLM(
trust_remote_code=True, model=model_id,
quantization="bitblas", dtype=torch.float16,
max_model_len=1024 trust_remote_code=True,
) quantization="bitblas",
``` max_model_len=1024
)
```
...@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r ...@@ -58,22 +58,24 @@ For FP8 quantization, we can recover accuracy with simple RTN quantization. We r
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
# Configure the simple PTQ quantization ```python
recipe = QuantizationModifier( from llmcompressor.transformers import oneshot
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) from llmcompressor.modifiers.quantization import QuantizationModifier
# Apply the quantization algorithm. # Configure the simple PTQ quantization
oneshot(model=model, recipe=recipe) recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
# Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic # Apply the quantization algorithm.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" oneshot(model=model, recipe=recipe)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR) # Save the model: Meta-Llama-3-8B-Instruct-FP8-Dynamic
``` SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
```
### 3. Evaluating Accuracy ### 3. Evaluating Accuracy
......
...@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \ ...@@ -41,42 +41,44 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
You can also use the GGUF model directly through the LLM entrypoint: You can also use the GGUF model directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
```python
# In this script, we demonstrate how to pass input to the chat method: from vllm import LLM, SamplingParams
conversation = [
{ # In this script, we demonstrate how to pass input to the chat method:
"role": "system", conversation = [
"content": "You are a helpful assistant" {
}, "role": "system",
{ "content": "You are a helpful assistant"
"role": "user", },
"content": "Hello" {
}, "role": "user",
{ "content": "Hello"
"role": "assistant", },
"content": "Hello! How can I assist you today?" {
}, "role": "assistant",
{ "content": "Hello! How can I assist you today?"
"role": "user", },
"content": "Write an essay about the importance of higher education.", {
}, "role": "user",
] "content": "Write an essay about the importance of higher education.",
},
# Create a sampling params object. ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create a sampling params object.
# Create an LLM. sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") # Create an LLM.
# Generate texts from the prompts. The output is a list of RequestOutput objects llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
# that contain the prompt, generated text, and other information. tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
outputs = llm.chat(conversation, sampling_params) # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
# Print the outputs. outputs = llm.chat(conversation, sampling_params)
for output in outputs:
prompt = output.prompt # Print the outputs.
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
...@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t ...@@ -31,28 +31,30 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t
Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
```python ??? Code
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
model_id = "meta-llama/Llama-3.2-1B-Instruct" ```python
quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit" from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
calibration_dataset = load_dataset( model_id = "meta-llama/Llama-3.2-1B-Instruct"
"allenai/c4", quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
).select(range(1024))["text"]
quant_config = QuantizeConfig(bits=4, group_size=128) calibration_dataset = load_dataset(
"allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
).select(range(1024))["text"]
model = GPTQModel.load(model_id, quant_config) quant_config = QuantizeConfig(bits=4, group_size=128)
# increase `batch_size` to match gpu/vram specs to speed up quantization model = GPTQModel.load(model_id, quant_config)
model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path) # increase `batch_size` to match gpu/vram specs to speed up quantization
``` model.quantize(calibration_dataset, batch_size=2)
model.save(quant_path)
```
## Running a quantized model with vLLM ## Running a quantized model with vLLM
...@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \ ...@@ -67,32 +69,34 @@ python examples/offline_inference/llm_engine_example.py \
GPTQModel quantized models are also supported directly through the LLM entrypoint: GPTQModel quantized models are also supported directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
```python
# Sample prompts. from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is", # Sample prompts.
"The president of the United States is", prompts = [
"The capital of France is", "Hello, my name is",
"The future of AI is", "The president of the United States is",
] "The capital of France is",
"The future of AI is",
# Create a sampling params object. ]
sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
# Create a sampling params object.
# Create an LLM. sampling_params = SamplingParams(temperature=0.6, top_p=0.9)
llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
# Create an LLM.
# Generate texts from the prompts. The output is a list of RequestOutput objects llm = LLM(model="ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2")
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
# Print the outputs. outputs = llm.generate(prompts, sampling_params)
print("-"*50)
for output in outputs: # Print the outputs.
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50) print("-"*50)
``` for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-"*50)
```
...@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd ...@@ -53,51 +53,55 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python ??? Code
from datasets import load_dataset
NUM_CALIBRATION_SAMPLES = 512 ```python
MAX_SEQUENCE_LENGTH = 2048 from datasets import load_dataset
# Load and preprocess the dataset NUM_CALIBRATION_SAMPLES = 512
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") MAX_SEQUENCE_LENGTH = 2048
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example): # Load and preprocess the dataset
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.map(preprocess) ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def tokenize(sample): def preprocess(example):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(tokenize, remove_columns=ds.column_names) ds = ds.map(preprocess)
```
def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Configure the quantization algorithms
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
# Apply quantization
oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128 ```python
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" from llmcompressor.transformers import oneshot
model.save_pretrained(SAVE_DIR, save_compressed=True) from llmcompressor.modifiers.quantization import GPTQModifier
tokenizer.save_pretrained(SAVE_DIR) from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
```
# Configure the quantization algorithms
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
# Apply quantization
oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save the compressed model: Meta-Llama-3-8B-Instruct-W4A16-G128
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W4A16 model with weights quantized to 4-bit integers. This process creates a W4A16 model with weights quantized to 4-bit integers.
...@@ -137,34 +141,36 @@ $ lm_eval --model vllm \ ...@@ -137,34 +141,36 @@ $ lm_eval --model vllm \
The following is an example of an expanded quantization recipe you can tune to your own use case: The following is an example of an expanded quantization recipe you can tune to your own use case:
```python ??? Code
from compressed_tensors.quantization import (
QuantizationArgs, ```python
QuantizationScheme, from compressed_tensors.quantization import (
QuantizationStrategy, QuantizationArgs,
QuantizationType, QuantizationScheme,
) QuantizationStrategy,
recipe = GPTQModifier( QuantizationType,
targets="Linear", )
config_groups={ recipe = GPTQModifier(
"config_group": QuantizationScheme( targets="Linear",
targets=["Linear"], config_groups={
weights=QuantizationArgs( "config_group": QuantizationScheme(
num_bits=4, targets=["Linear"],
type=QuantizationType.INT, weights=QuantizationArgs(
strategy=QuantizationStrategy.GROUP, num_bits=4,
group_size=128, type=QuantizationType.INT,
symmetric=True, strategy=QuantizationStrategy.GROUP,
dynamic=False, group_size=128,
actorder="weight", symmetric=True,
dynamic=False,
actorder="weight",
),
), ),
), },
}, ignore=["lm_head"],
ignore=["lm_head"], update_size=NUM_CALIBRATION_SAMPLES,
update_size=NUM_CALIBRATION_SAMPLES, dampening_frac=0.01
dampening_frac=0.01 )
) ```
```
## Troubleshooting and Support ## Troubleshooting and Support
......
...@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa ...@@ -54,54 +54,60 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data. It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:
```python ??? Code
from datasets import load_dataset
NUM_CALIBRATION_SAMPLES = 512 ```python
MAX_SEQUENCE_LENGTH = 2048 from datasets import load_dataset
# Load and preprocess the dataset NUM_CALIBRATION_SAMPLES = 512
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") MAX_SEQUENCE_LENGTH = 2048
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def preprocess(example): # Load and preprocess the dataset
return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.map(preprocess) ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
def tokenize(sample): def preprocess(example):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}
ds = ds.map(tokenize, remove_columns=ds.column_names) ds = ds.map(preprocess)
```
def tokenize(sample):
return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False)
ds = ds.map(tokenize, remove_columns=ds.column_names)
```
### 3. Applying Quantization ### 3. Applying Quantization
Now, apply the quantization algorithms: Now, apply the quantization algorithms:
```python ??? Code
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier ```python
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
# Configure the quantization algorithms from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
recipe = [
SmoothQuantModifier(smoothing_strength=0.8), # Configure the quantization algorithms
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), recipe = [
] SmoothQuantModifier(smoothing_strength=0.8),
GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
# Apply quantization ]
oneshot(
model=model, # Apply quantization
dataset=ds, oneshot(
recipe=recipe, model=model,
max_seq_length=MAX_SEQUENCE_LENGTH, dataset=ds,
num_calibration_samples=NUM_CALIBRATION_SAMPLES, recipe=recipe,
) max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
# Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token )
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True) # Save the compressed model: Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token
tokenizer.save_pretrained(SAVE_DIR) SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token"
``` model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
This process creates a W8A8 model with weights and activations quantized to 8-bit integers. This process creates a W8A8 model with weights and activations quantized to 8-bit integers.
......
...@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te ...@@ -14,24 +14,26 @@ You can quantize HuggingFace models using the example scripts provided in the Te
Below is an example showing how to quantize a model using modelopt's PTQ API: Below is an example showing how to quantize a model using modelopt's PTQ API:
```python ??? Code
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
# Load the model from HuggingFace ```python
model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>") import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
# Select the quantization config, for example, FP8 # Load the model from HuggingFace
config = mtq.FP8_DEFAULT_CFG model = AutoModelForCausalLM.from_pretrained("<path_or_model_id>")
# Define a forward loop function for calibration # Select the quantization config, for example, FP8
def forward_loop(model): config = mtq.FP8_DEFAULT_CFG
for data in calib_set:
model(data)
# PTQ with in-place replacement of quantized modules # Define a forward loop function for calibration
model = mtq.quantize(model, config, forward_loop) def forward_loop(model):
``` for data in calib_set:
model(data)
# PTQ with in-place replacement of quantized modules
model = mtq.quantize(model, config, forward_loop)
```
After the model is quantized, you can export it to a quantized checkpoint using the export API: After the model is quantized, you can export it to a quantized checkpoint using the export API:
...@@ -48,31 +50,33 @@ with torch.inference_mode(): ...@@ -48,31 +50,33 @@ with torch.inference_mode():
The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM: The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:
```python ??? Code
from vllm import LLM, SamplingParams
def main(): ```python
from vllm import LLM, SamplingParams
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" def main():
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9) model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
outputs = llm.generate(prompts, sampling_params) prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__": for output in outputs:
main() prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
main()
```
...@@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades ...@@ -35,20 +35,22 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades
Here is an example of how to enable FP8 quantization: Here is an example of how to enable FP8 quantization:
```python ??? Code
# To calculate kv cache scales on the fly enable the calculate_kv_scales
# parameter
from vllm import LLM, SamplingParams ```python
# To calculate kv cache scales on the fly enable the calculate_kv_scales
# parameter
sampling_params = SamplingParams(temperature=0.7, top_p=0.8) from vllm import LLM, SamplingParams
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
kv_cache_dtype="fp8", sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
calculate_kv_scales=True) llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
prompt = "London is the capital of" kv_cache_dtype="fp8",
out = llm.generate(prompt, sampling_params)[0].outputs[0].text calculate_kv_scales=True)
print(out) prompt = "London is the capital of"
``` out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)
```
The `kv_cache_dtype` argument specifies the data type for KV cache storage: The `kv_cache_dtype` argument specifies the data type for KV cache storage:
- `"auto"`: Uses the model's default "unquantized" data type - `"auto"`: Uses the model's default "unquantized" data type
...@@ -71,67 +73,69 @@ pip install llmcompressor ...@@ -71,67 +73,69 @@ pip install llmcompressor
Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):
```python ??? Code
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer ```python
from llmcompressor.transformers import oneshot from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# Select model and load it from llmcompressor.transformers import oneshot
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") # Select model and load it
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
# Select calibration dataset tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft" # Select calibration dataset
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
# Configure calibration parameters DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point
MAX_SEQUENCE_LENGTH = 2048 # Configure calibration parameters
NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point
# Load and preprocess dataset MAX_SEQUENCE_LENGTH = 2048
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) # Load and preprocess dataset
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
def process_and_tokenize(example): ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
return tokenizer( def process_and_tokenize(example):
text, text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
padding=False, return tokenizer(
max_length=MAX_SEQUENCE_LENGTH, text,
truncation=True, padding=False,
add_special_tokens=False, max_length=MAX_SEQUENCE_LENGTH,
truncation=True,
add_special_tokens=False,
)
ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)
# Configure quantization settings
recipe = """
quant_stage:
quant_modifiers:
QuantizationModifier:
kv_cache_scheme:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
"""
# Apply quantization
oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
) )
ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) # Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
# Configure quantization settings model.save_pretrained(SAVE_DIR, save_compressed=True)
recipe = """ tokenizer.save_pretrained(SAVE_DIR)
quant_stage: ```
quant_modifiers:
QuantizationModifier:
kv_cache_scheme:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
"""
# Apply quantization
oneshot(
model=model,
dataset=ds,
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Save quantized model: Llama-3.1-8B-Instruct-FP8-KV
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales.
......
...@@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below: ...@@ -42,20 +42,22 @@ The Quark quantization process can be listed for 5 steps as below:
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index) Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
to fetch the model and tokenizer. to fetch the model and tokenizer.
```python ??? Code
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf" ```python
MAX_SEQ_LEN = 512 from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained( MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
MODEL_ID, device_map="auto", torch_dtype="auto", MAX_SEQ_LEN = 512
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN) model = AutoModelForCausalLM.from_pretrained(
tokenizer.pad_token = tokenizer.eos_token MODEL_ID, device_map="auto", torch_dtype="auto",
``` )
model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
tokenizer.pad_token = tokenizer.eos_token
```
### 2. Prepare the Calibration Dataloader ### 2. Prepare the Calibration Dataloader
...@@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic ...@@ -63,22 +65,24 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
to load calibration data. For more details about how to use calibration datasets efficiently, please refer to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html). to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
```python ??? Code
from datasets import load_dataset
from torch.utils.data import DataLoader
BATCH_SIZE = 1 ```python
NUM_CALIBRATION_DATA = 512 from datasets import load_dataset
from torch.utils.data import DataLoader
# Load the dataset and get calibration data. BATCH_SIZE = 1
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") NUM_CALIBRATION_DATA = 512
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer(text_data, return_tensors="pt", # Load the dataset and get calibration data.
padding=True, truncation=True, max_length=MAX_SEQ_LEN) dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
calib_dataloader = DataLoader(tokenized_outputs['input_ids'], text_data = dataset["text"][:NUM_CALIBRATION_DATA]
batch_size=BATCH_SIZE, drop_last=True)
``` tokenized_outputs = tokenizer(text_data, return_tensors="pt",
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE, drop_last=True)
```
### 3. Set the Quantization Configuration ### 3. Set the Quantization Configuration
...@@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. ...@@ -94,42 +98,44 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
The AutoSmoothQuant config file for Llama is The AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`. `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
```python ??? Code
from quark.torch.quantization import (Config, QuantizationConfig,
FP8E4M3PerTensorSpec, ```python
load_quant_algo_config_from_file) from quark.torch.quantization import (Config, QuantizationConfig,
FP8E4M3PerTensorSpec,
# Define fp8/per-tensor/static spec. load_quant_algo_config_from_file)
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
is_dynamic=False).to_quantization_spec() # Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. is_dynamic=False).to_quantization_spec()
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC) # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. weight=FP8_PER_TENSOR_SPEC)
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
kv_cache_quant_config = {name : KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
QuantizationConfig(input_tensors=global_quant_config.input_tensors, kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
weight=global_quant_config.weight, kv_cache_quant_config = {name :
output_tensors=KV_CACHE_SPEC) QuantizationConfig(input_tensors=global_quant_config.input_tensors,
for name in kv_cache_layer_names_for_llama} weight=global_quant_config.weight,
layer_quant_config = kv_cache_quant_config.copy() output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
# Define algorithm config by config file. layer_quant_config = kv_cache_quant_config.copy()
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' # Define algorithm config by config file.
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
EXCLUDE_LAYERS = ["lm_head"] algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
quant_config = Config(
global_quant_config=global_quant_config, EXCLUDE_LAYERS = ["lm_head"]
layer_quant_config=layer_quant_config, quant_config = Config(
kv_cache_quant_config=kv_cache_quant_config, global_quant_config=global_quant_config,
exclude=EXCLUDE_LAYERS, layer_quant_config=layer_quant_config,
algo_config=algo_config) kv_cache_quant_config=kv_cache_quant_config,
``` exclude=EXCLUDE_LAYERS,
algo_config=algo_config)
```
### 4. Quantize the Model and Export ### 4. Quantize the Model and Export
...@@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to ...@@ -139,63 +145,67 @@ HuggingFace `safetensors`, you can refer to
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html) [HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more exporting format details. for more exporting format details.
```python ??? Code
import torch
from quark.torch import ModelQuantizer, ModelExporter ```python
from quark.torch.export import ExporterConfig, JsonExporterConfig import torch
from quark.torch import ModelQuantizer, ModelExporter
# Apply quantization. from quark.torch.export import ExporterConfig, JsonExporterConfig
quantizer = ModelQuantizer(quant_config)
quant_model = quantizer.quantize_model(model, calib_dataloader) # Apply quantization.
quantizer = ModelQuantizer(quant_config)
# Freeze quantized model to export. quant_model = quantizer.quantize_model(model, calib_dataloader)
freezed_model = quantizer.freeze(model)
# Freeze quantized model to export.
# Define export config. freezed_model = quantizer.freeze(model)
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
export_config = ExporterConfig(json_export_config=JsonExporterConfig()) # Define export config.
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
# Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) # Model: Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant
with torch.no_grad(): EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter.export_safetensors_model(freezed_model, exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
quant_config=quant_config, tokenizer=tokenizer) with torch.no_grad():
``` exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer)
```
### 5. Evaluation in vLLM ### 5. Evaluation in vLLM
Now, you can load and run the Quark quantized model directly through the LLM entrypoint: Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
```python ??? Code
from vllm import LLM, SamplingParams
```python
# Sample prompts. from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is", # Sample prompts.
"The president of the United States is", prompts = [
"The capital of France is", "Hello, my name is",
"The future of AI is", "The president of the United States is",
] "The capital of France is",
# Create a sampling params object. "The future of AI is",
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ]
# Create a sampling params object.
# Create an LLM. sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype='fp8',quantization='quark') # Create an LLM.
# Generate texts from the prompts. The output is a list of RequestOutput objects llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
# that contain the prompt, generated text, and other information. kv_cache_dtype='fp8',quantization='quark')
outputs = llm.generate(prompts, sampling_params) # Generate texts from the prompts. The output is a list of RequestOutput objects
# Print the outputs. # that contain the prompt, generated text, and other information.
print("\nGenerated Outputs:\n" + "-" * 60) outputs = llm.generate(prompts, sampling_params)
for output in outputs: # Print the outputs.
prompt = output.prompt print("\nGenerated Outputs:\n" + "-" * 60)
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}") prompt = output.prompt
print(f"Output: {generated_text!r}") generated_text = output.outputs[0].text
print("-" * 60) print(f"Prompt: {prompt!r}")
``` print(f"Output: {generated_text!r}")
print("-" * 60)
```
Or, you can use `lm_eval` to evaluate accuracy: Or, you can use `lm_eval` to evaluate accuracy:
......
...@@ -15,26 +15,28 @@ pip install \ ...@@ -15,26 +15,28 @@ pip install \
## Quantizing HuggingFace Models ## Quantizing HuggingFace Models
You can quantize your own Hugging Face model with torchao, e.g. with [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to the Hugging Face Hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) using the following example code: You can quantize your own Hugging Face model with torchao, e.g. with [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to the Hugging Face Hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) using the following example code:
```Python ??? Code
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer ```Python
from torchao.quantization import Int8WeightOnlyConfig import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
model_name = "meta-llama/Meta-Llama-3-8B" from torchao.quantization import Int8WeightOnlyConfig
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained( model_name = "meta-llama/Meta-Llama-3-8B"
model_name, quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
torch_dtype="auto", quantized_model = AutoModelForCausalLM.from_pretrained(
device_map="auto", model_name,
quantization_config=quantization_config torch_dtype="auto",
) device_map="auto",
tokenizer = AutoTokenizer.from_pretrained(model_name) quantization_config=quantization_config
input_text = "What are we having for dinner?" )
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
hub_repo = # YOUR HUB REPO ID input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False) hub_repo = # YOUR HUB REPO ID
``` tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```
Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI. Alternatively, you can use the [TorchAO Quantization space](https://huggingface.co/spaces/medmekk/TorchAO_Quantization) for quantizing models with a simple UI.
...@@ -33,34 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ ...@@ -33,34 +33,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
Next, make a request to the model that should return the reasoning content in the response. Next, make a request to the model that should return the reasoning content in the response.
```python ??? Code
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server. ```python
openai_api_key = "EMPTY" from openai import OpenAI
openai_api_base = "http://localhost:8000/v1"
client = OpenAI( # Modify OpenAI's API key and API base to use vLLM's API server.
api_key=openai_api_key, openai_api_key = "EMPTY"
base_url=openai_api_base, openai_api_base = "http://localhost:8000/v1"
)
models = client.models.list() client = OpenAI(
model = models.data[0].id api_key=openai_api_key,
base_url=openai_api_base,
)
# Round 1 models = client.models.list()
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] model = models.data[0].id
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content # Round 1
content = response.choices[0].message.content messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
response = client.chat.completions.create(model=model, messages=messages)
print("reasoning_content:", reasoning_content) reasoning_content = response.choices[0].message.reasoning_content
print("content:", content) content = response.choices[0].message.content
```
print("reasoning_content:", reasoning_content)
print("content:", content)
```
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
...@@ -68,77 +70,81 @@ The `reasoning_content` field contains the reasoning steps that led to the final ...@@ -68,77 +70,81 @@ The `reasoning_content` field contains the reasoning steps that led to the final
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
```json ??? Json
{
"id": "chatcmpl-123", ```json
"object": "chat.completion.chunk", {
"created": 1694268190, "id": "chatcmpl-123",
"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "object": "chat.completion.chunk",
"system_fingerprint": "fp_44709d6fcb", "created": 1694268190,
"choices": [ "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
{ "system_fingerprint": "fp_44709d6fcb",
"index": 0, "choices": [
"delta": { {
"role": "assistant", "index": 0,
"reasoning_content": "is", "delta": {
}, "role": "assistant",
"logprobs": null, "reasoning_content": "is",
"finish_reason": null },
} "logprobs": null,
] "finish_reason": null
} }
``` ]
}
```
The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does support extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example: The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does support extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:
```python ??? Code
from openai import OpenAI
```python
# Modify OpenAI's API key and API base to use vLLM's API server. from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" # Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
client = OpenAI( openai_api_base = "http://localhost:8000/v1"
api_key=openai_api_key,
base_url=openai_api_base, client = OpenAI(
) api_key=openai_api_key,
base_url=openai_api_base,
models = client.models.list() )
model = models.data[0].id
models = client.models.list()
messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] model = models.data[0].id
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add: messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}]
# extra_body={"chat_template_kwargs": {"enable_thinking": False}} # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
stream = client.chat.completions.create(model=model, # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
messages=messages, # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream=True) stream = client.chat.completions.create(model=model,
messages=messages,
print("client: Start streaming chat completions...") stream=True)
printed_reasoning_content = False
printed_content = False print("client: Start streaming chat completions...")
printed_reasoning_content = False
for chunk in stream: printed_content = False
reasoning_content = None
content = None for chunk in stream:
# Check the content is reasoning_content or content reasoning_content = None
if hasattr(chunk.choices[0].delta, "reasoning_content"): content = None
reasoning_content = chunk.choices[0].delta.reasoning_content # Check the content is reasoning_content or content
elif hasattr(chunk.choices[0].delta, "content"): if hasattr(chunk.choices[0].delta, "reasoning_content"):
content = chunk.choices[0].delta.content reasoning_content = chunk.choices[0].delta.reasoning_content
elif hasattr(chunk.choices[0].delta, "content"):
if reasoning_content is not None: content = chunk.choices[0].delta.content
if not printed_reasoning_content:
printed_reasoning_content = True if reasoning_content is not None:
print("reasoning_content:", end="", flush=True) if not printed_reasoning_content:
print(reasoning_content, end="", flush=True) printed_reasoning_content = True
elif content is not None: print("reasoning_content:", end="", flush=True)
if not printed_content: print(reasoning_content, end="", flush=True)
printed_content = True elif content is not None:
print("\ncontent:", end="", flush=True) if not printed_content:
# Extract and print the content printed_content = True
print(content, end="", flush=True) print("\ncontent:", end="", flush=True)
``` # Extract and print the content
print(content, end="", flush=True)
```
Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). Remember to check whether `reasoning_content` exists in the response before accessing it. You can check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
...@@ -146,41 +152,43 @@ Remember to check whether the `reasoning_content` exists in the response before ...@@ -146,41 +152,43 @@ Remember to check whether the `reasoning_content` exists in the response before
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`. The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
```python ??? Code
from openai import OpenAI
```python
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") from openai import OpenAI
tools = [{ client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
"type": "function",
"function": { tools = [{
"name": "get_weather", "type": "function",
"description": "Get the current weather in a given location", "function": {
"parameters": { "name": "get_weather",
"type": "object", "description": "Get the current weather in a given location",
"properties": { "parameters": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, "type": "object",
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} "properties": {
}, "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"required": ["location", "unit"] "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
}
} }
} }]
}]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto" tool_choice="auto"
) )
print(response) print(response)
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {response.choices[0].message.reasoning_content}") print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
print(f"Function called: {tool_call.name}") print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}") print(f"Arguments: {tool_call.arguments}")
``` ```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>. For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>.
...@@ -192,85 +200,89 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_ ...@@ -192,85 +200,89 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>. You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
```python ??? Code
# import the required packages
```python
from vllm.reasoning import ReasoningParser, ReasoningParserManager # import the required packages
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage) from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
# define a reasoning parser and register it to vllm DeltaMessage)
# the name list in register_module can be used
# in --reasoning-parser. # define a reasoning parser and register it to vllm
@ReasoningParserManager.register_module(["example"]) # the name list in register_module can be used
class ExampleParser(ReasoningParser): # in --reasoning-parser.
def __init__(self, tokenizer: AnyTokenizer): @ReasoningParserManager.register_module(["example"])
super().__init__(tokenizer) class ExampleParser(ReasoningParser):
def __init__(self, tokenizer: AnyTokenizer):
def extract_reasoning_content_streaming( super().__init__(tokenizer)
self,
previous_text: str, def extract_reasoning_content_streaming(
current_text: str, self,
delta_text: str, previous_text: str,
previous_token_ids: Sequence[int], current_text: str,
current_token_ids: Sequence[int], delta_text: str,
delta_token_ids: Sequence[int], previous_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]: current_token_ids: Sequence[int],
""" delta_token_ids: Sequence[int],
Instance method that should be implemented for extracting reasoning ) -> Union[DeltaMessage, None]:
from an incomplete response; for use when handling reasoning calls and """
streaming. Has to be an instance method because it requires state - Instance method that should be implemented for extracting reasoning
the current tokens/diffs, but also the information about what has from an incomplete response; for use when handling reasoning calls and
previously been parsed and extracted (see constructor) streaming. Has to be an instance method because it requires state -
""" the current tokens/diffs, but also the information about what has
previously been parsed and extracted (see constructor)
def extract_reasoning_content( """
self, model_output: str, request: ChatCompletionRequest
) -> tuple[Optional[str], Optional[str]]: def extract_reasoning_content(
""" self, model_output: str, request: ChatCompletionRequest
Extract reasoning content from a complete model-generated string. ) -> tuple[Optional[str], Optional[str]]:
"""
Used for non-streaming responses where we have the entire model response Extract reasoning content from a complete model-generated string.
available before sending to the client.
Used for non-streaming responses where we have the entire model response
available before sending to the client.
Parameters:
model_output: str
The model-generated string to extract reasoning content from.
request: ChatCompletionRequest
The request object that was used to generate the model_output.
Returns:
tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
"""
```
Parameters: Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.
model_output: str
The model-generated string to extract reasoning content from.
request: ChatCompletionRequest ??? Code
The request object that was used to generate the model_output.
Returns: ```python
tuple[Optional[str], Optional[str]] @dataclass
A tuple containing the reasoning content and the content. class DeepSeekReasoner(Reasoner):
""" """
``` Reasoner for DeepSeek R series models.
"""
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>. start_token_id: int
end_token_id: int
```python
@dataclass start_token: str = "<think>"
class DeepSeekReasoner(Reasoner): end_token: str = "</think>"
"""
Reasoner for DeepSeek R series models. @classmethod
""" def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
start_token_id: int return cls(start_token_id=tokenizer.encode(
end_token_id: int "<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>",
start_token: str = "<think>" add_special_tokens=False)[0])
end_token: str = "</think>"
def is_reasoning_end(self, input_ids: list[int]) -> bool:
@classmethod return self.end_token_id in input_ids
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: ...
return cls(start_token_id=tokenizer.encode( ```
"<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>",
add_special_tokens=False)[0])
def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids
...
```
A structured output engine such as [xgrammar](https://github.com/mlc-ai/xgrammar) uses `end_token_id` to check whether reasoning content is present in the model output and, if so, skips the structured output. A structured output engine such as [xgrammar](https://github.com/mlc-ai/xgrammar) uses `end_token_id` to check whether reasoning content is present in the model output and, if so, skips the structured output.
......
...@@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory ...@@ -18,29 +18,31 @@ Speculative decoding is a technique which improves inter-token latency in memory
The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
```python ??? Code
from vllm import LLM, SamplingParams
```python
prompts = [ from vllm import LLM, SamplingParams
"The future of AI is",
] prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) "The future of AI is",
]
llm = LLM( sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
model="facebook/opt-6.7b",
tensor_parallel_size=1, llm = LLM(
speculative_config={ model="facebook/opt-6.7b",
"model": "facebook/opt-125m", tensor_parallel_size=1,
"num_speculative_tokens": 5, speculative_config={
}, "model": "facebook/opt-125m",
) "num_speculative_tokens": 5,
outputs = llm.generate(prompts, sampling_params) },
)
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
To do the same in online mode, launch the server: To do the same in online mode, launch the server:
...@@ -60,69 +62,73 @@ python -m vllm.entrypoints.openai.api_server \ ...@@ -60,69 +62,73 @@ python -m vllm.entrypoints.openai.api_server \
Then use a client: Then use a client:
```python ??? Code
from openai import OpenAI
```python
# Modify OpenAI's API key and API base to use vLLM's API server. from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" # Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
client = OpenAI( openai_api_base = "http://localhost:8000/v1"
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key, client = OpenAI(
base_url=openai_api_base, # defaults to os.environ.get("OPENAI_API_KEY")
) api_key=openai_api_key,
base_url=openai_api_base,
models = client.models.list() )
model = models.data[0].id
models = client.models.list()
# Completion API model = models.data[0].id
stream = False
completion = client.completions.create( # Completion API
model=model, stream = False
prompt="The future of AI is", completion = client.completions.create(
echo=False, model=model,
n=1, prompt="The future of AI is",
stream=stream, echo=False,
) n=1,
stream=stream,
print("Completion results:") )
if stream:
for c in completion: print("Completion results:")
print(c) if stream:
else: for c in completion:
print(completion) print(c)
``` else:
print(completion)
```
## Speculating by matching n-grams in the prompt ## Speculating by matching n-grams in the prompt
The following code configures vLLM to use speculative decoding where proposals are generated by The following code configures vLLM to use speculative decoding where proposals are generated by
matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259) matching n-grams in the prompt. For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259)
```python ??? Code
from vllm import LLM, SamplingParams
```python
prompts = [ from vllm import LLM, SamplingParams
"The future of AI is",
] prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) "The future of AI is",
]
llm = LLM( sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
model="facebook/opt-6.7b",
tensor_parallel_size=1, llm = LLM(
speculative_config={ model="facebook/opt-6.7b",
"method": "ngram", tensor_parallel_size=1,
"num_speculative_tokens": 5, speculative_config={
"prompt_lookup_max": 4, "method": "ngram",
}, "num_speculative_tokens": 5,
) "prompt_lookup_max": 4,
outputs = llm.generate(prompts, sampling_params) },
)
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
## Speculating using MLP speculators ## Speculating using MLP speculators
...@@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam ...@@ -131,29 +137,31 @@ draft models that conditioning draft predictions on both context vectors and sam
For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
[this technical report](https://arxiv.org/abs/2404.19124). [this technical report](https://arxiv.org/abs/2404.19124).
```python ??? Code
from vllm import LLM, SamplingParams
```python
prompts = [ from vllm import LLM, SamplingParams
"The future of AI is",
] prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) "The future of AI is",
]
llm = LLM( sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
tensor_parallel_size=4, llm = LLM(
speculative_config={ model="meta-llama/Meta-Llama-3.1-70B-Instruct",
"model": "ibm-ai-platform/llama3-70b-accelerator", tensor_parallel_size=4,
"draft_tensor_parallel_size": 1, speculative_config={
}, "model": "ibm-ai-platform/llama3-70b-accelerator",
) "draft_tensor_parallel_size": 1,
outputs = llm.generate(prompts, sampling_params) },
)
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
Note that these speculative models currently need to be run without tensor parallelism, although Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the it is possible to run the main model using tensor parallelism (see example above). Since the
...@@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub: ...@@ -177,31 +185,33 @@ A variety of speculative models of this type are available on HF hub:
The following code configures vLLM to use speculative decoding where proposals are generated by The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py). an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
```python ??? Code
from vllm import LLM, SamplingParams
prompts = [ ```python
"The future of AI is", from vllm import LLM, SamplingParams
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( prompts = [
model="meta-llama/Meta-Llama-3-8B-Instruct", "The future of AI is",
tensor_parallel_size=4, ]
speculative_config={ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1,
},
)
outputs = llm.generate(prompts, sampling_params) llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
tensor_parallel_size=4,
speculative_config={
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1,
},
)
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
A few important things to consider when using the EAGLE based draft models: A few important things to consider when using the EAGLE based draft models:
......
...@@ -33,39 +33,43 @@ text. ...@@ -33,39 +33,43 @@ text.
Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one: Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:
```python ??? Code
from openai import OpenAI
client = OpenAI( ```python
base_url="http://localhost:8000/v1", from openai import OpenAI
api_key="-", client = OpenAI(
) base_url="http://localhost:8000/v1",
model = client.models.list().data[0].id api_key="-",
)
completion = client.chat.completions.create( model = client.models.list().data[0].id
model=model,
messages=[ completion = client.chat.completions.create(
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} model=model,
], messages=[
extra_body={"guided_choice": ["positive", "negative"]}, {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
) ],
print(completion.choices[0].message.content) extra_body={"guided_choice": ["positive", "negative"]},
``` )
print(completion.choices[0].message.content)
```
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
```python ??? Code
completion = client.chat.completions.create(
model=model, ```python
messages=[ completion = client.chat.completions.create(
{ model=model,
"role": "user", messages=[
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", {
} "role": "user",
], "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, }
) ],
print(completion.choices[0].message.content) extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
``` )
print(completion.choices[0].message.content)
```
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
For this we can use the `guided_json` parameter in two different ways: For this we can use the `guided_json` parameter in two different ways:
...@@ -75,41 +79,43 @@ For this we can use the `guided_json` parameter in two different ways: ...@@ -75,41 +79,43 @@ For this we can use the `guided_json` parameter in two different ways:
The next example shows how to use the `guided_json` parameter with a Pydantic model: The next example shows how to use the `guided_json` parameter with a Pydantic model:
```python ??? Code
from pydantic import BaseModel
from enum import Enum ```python
from pydantic import BaseModel
class CarType(str, Enum): from enum import Enum
sedan = "sedan"
suv = "SUV" class CarType(str, Enum):
truck = "Truck" sedan = "sedan"
coupe = "Coupe" suv = "SUV"
truck = "Truck"
class CarDescription(BaseModel): coupe = "Coupe"
brand: str
model: str class CarDescription(BaseModel):
car_type: CarType brand: str
model: str
json_schema = CarDescription.model_json_schema() car_type: CarType
completion = client.chat.completions.create( json_schema = CarDescription.model_json_schema()
model=model,
messages=[ completion = client.chat.completions.create(
{ model=model,
"role": "user", messages=[
"content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", {
} "role": "user",
], "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's",
"response_format": { }
"type": "json_schema", ],
"json_schema": { "response_format": {
"name": "car-description", "type": "json_schema",
"schema": CarDescription.model_json_schema() "json_schema": {
"name": "car-description",
"schema": CarDescription.model_json_schema()
},
}, },
}, )
) print(completion.choices[0].message.content)
print(completion.choices[0].message.content) ```
```
!!! tip !!! tip
While not strictly necessary, normally it's better to indicate in the prompt the While not strictly necessary, normally it's better to indicate in the prompt the
...@@ -121,33 +127,35 @@ difficult to use, but it´s really powerful. It allows us to define complete ...@@ -121,33 +127,35 @@ difficult to use, but it´s really powerful. It allows us to define complete
languages like SQL queries. It works by using a context free EBNF grammar. languages like SQL queries. It works by using a context free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries: As an example, we can use it to define a specific format of simplified SQL queries:
```python ??? Code
simplified_sql_grammar = """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition ```python
simplified_sql_grammar = """
root ::= select_statement
column ::= "col_1 " | "col_2 " select_statement ::= "SELECT " column " from " table " where " condition
table ::= "table_1 " | "table_2 " column ::= "col_1 " | "col_2 "
condition ::= column "= " number table ::= "table_1 " | "table_2 "
number ::= "1 " | "2 " condition ::= column "= " number
"""
completion = client.chat.completions.create( number ::= "1 " | "2 "
model=model, """
messages=[
{ completion = client.chat.completions.create(
"role": "user", model=model,
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", messages=[
} {
], "role": "user",
extra_body={"guided_grammar": simplified_sql_grammar}, "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
) }
print(completion.choices[0].message.content) ],
``` extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)
```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
...@@ -161,34 +169,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r ...@@ -161,34 +169,36 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r
Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema: Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:
```python ??? Code
from pydantic import BaseModel
```python
from pydantic import BaseModel
class People(BaseModel):
name: str
age: int class People(BaseModel):
name: str
age: int
completion = client.chat.completions.create(
model=model,
messages=[ completion = client.chat.completions.create(
{ model=model,
"role": "user", messages=[
"content": "Generate a JSON with the name and age of one random person.", {
} "role": "user",
], "content": "Generate a JSON with the name and age of one random person.",
response_format={ }
"type": "json_schema", ],
"json_schema": { response_format={
"name": "people", "type": "json_schema",
"schema": People.model_json_schema() "json_schema": {
} "name": "people",
}, "schema": People.model_json_schema()
) }
print("reasoning_content: ", completion.choices[0].message.reasoning_content) },
print("content: ", completion.choices[0].message.content) )
``` print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content)
```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
...@@ -202,33 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3. ...@@ -202,33 +212,33 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.
Here is a simple example demonstrating how to get structured output using Pydantic models: Here is a simple example demonstrating how to get structured output using Pydantic models:
```python ??? Code
from pydantic import BaseModel
from openai import OpenAI ```python
from pydantic import BaseModel
class Info(BaseModel): from openai import OpenAI
name: str
age: int class Info(BaseModel):
name: str
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") age: int
model = client.models.list().data[0].id
completion = client.beta.chat.completions.parse( client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
model=model, model = client.models.list().data[0].id
messages=[ completion = client.beta.chat.completions.parse(
{"role": "system", "content": "You are a helpful assistant."}, model=model,
{"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, messages=[
], {"role": "system", "content": "You are a helpful assistant."},
response_format=Info, {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"},
) ],
response_format=Info,
message = completion.choices[0].message )
print(message)
assert message.parsed message = completion.choices[0].message
print("Name:", message.parsed.name) print(message)
print("Age:", message.parsed.age) assert message.parsed
``` print("Name:", message.parsed.name)
print("Age:", message.parsed.age)
Output: ```
```console ```console
ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28))
...@@ -238,35 +248,37 @@ Age: 28 ...@@ -238,35 +248,37 @@ Age: 28
Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:
```python ??? Code
from typing import List
from pydantic import BaseModel ```python
from openai import OpenAI from typing import List
from pydantic import BaseModel
class Step(BaseModel): from openai import OpenAI
explanation: str
output: str class Step(BaseModel):
explanation: str
class MathResponse(BaseModel): output: str
steps: list[Step]
final_answer: str class MathResponse(BaseModel):
steps: list[Step]
completion = client.beta.chat.completions.parse( final_answer: str
model=model,
messages=[ completion = client.beta.chat.completions.parse(
{"role": "system", "content": "You are a helpful expert math tutor."}, model=model,
{"role": "user", "content": "Solve 8x + 31 = 2."}, messages=[
], {"role": "system", "content": "You are a helpful expert math tutor."},
response_format=MathResponse, {"role": "user", "content": "Solve 8x + 31 = 2."},
) ],
response_format=MathResponse,
message = completion.choices[0].message )
print(message)
assert message.parsed message = completion.choices[0].message
for i, step in enumerate(message.parsed.steps): print(message)
print(f"Step #{i}:", step) assert message.parsed
print("Answer:", message.parsed.final_answer) for i, step in enumerate(message.parsed.steps):
``` print(f"Step #{i}:", step)
print("Answer:", message.parsed.final_answer)
```
Output: Output:
...@@ -296,19 +308,21 @@ These parameters can be used in the same way as the parameters from the Online ...@@ -296,19 +308,21 @@ These parameters can be used in the same way as the parameters from the Online
Serving examples above. One example for the usage of the `choice` parameter is Serving examples above. One example for the usage of the `choice` parameter is
shown below: shown below:
```python ??? Code
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") ```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate( guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
prompts="Classify this sentiment: vLLM is wonderful!", sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
sampling_params=sampling_params, outputs = llm.generate(
) prompts="Classify this sentiment: vLLM is wonderful!",
print(outputs[0].outputs[0].text) sampling_params=sampling_params,
``` )
print(outputs[0].outputs[0].text)
```
See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html) See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
...@@ -15,44 +15,46 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \ ...@@ -15,44 +15,46 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \
Next, make a request to the model that should result in it using the available tools: Next, make a request to the model that should result in it using the available tools:
```python ??? Code
from openai import OpenAI
import json ```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") import json
def get_weather(location: str, unit: str): client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather} def get_weather(location: str, unit: str):
return f"Getting the weather for {location} in {unit}..."
tools = [{ tool_functions = {"get_weather": get_weather}
"type": "function",
"function": { tools = [{
"name": "get_weather", "type": "function",
"description": "Get the current weather in a given location", "function": {
"parameters": { "name": "get_weather",
"type": "object", "description": "Get the current weather in a given location",
"properties": { "parameters": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, "type": "object",
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} "properties": {
}, "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"required": ["location", "unit"] "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
}
} }
} }]
}]
response = client.chat.completions.create(
response = client.chat.completions.create( model=client.models.list().data[0].id,
model=client.models.list().data[0].id, messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], tools=tools,
tools=tools, tool_choice="auto"
tool_choice="auto" )
)
tool_call = response.choices[0].message.tool_calls[0].function
tool_call = response.choices[0].message.tool_calls[0].function print(f"Function called: {tool_call.name}")
print(f"Function called: {tool_call.name}") print(f"Arguments: {tool_call.arguments}")
print(f"Arguments: {tool_call.arguments}") print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") ```
```
Example output: Example output:
...@@ -301,49 +303,51 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen ...@@ -301,49 +303,51 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen
Here is a summary of a plugin file: Here is a summary of a plugin file:
```python ??? Code
# import the required packages ```python
# define a tool parser and register it to vllm # import the required packages
# the name list in register_module can be used
# in --tool-call-parser. you can define as many # define a tool parser and register it to vllm
# tool parsers as you want here. # the name list in register_module can be used
@ToolParserManager.register_module(["example"]) # in --tool-call-parser. you can define as many
class ExampleToolParser(ToolParser): # tool parsers as you want here.
def __init__(self, tokenizer: AnyTokenizer): @ToolParserManager.register_module(["example"])
super().__init__(tokenizer) class ExampleToolParser(ToolParser):
def __init__(self, tokenizer: AnyTokenizer):
# adjust request. e.g.: set skip special tokens super().__init__(tokenizer)
# to False for tool call output.
def adjust_request( # adjust request. e.g.: set skip special tokens
self, request: ChatCompletionRequest) -> ChatCompletionRequest: # to False for tool call output.
return request def adjust_request(
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
# implement the tool call parse for stream call return request
def extract_tool_calls_streaming(
self, # implement the tool call parse for stream call
previous_text: str, def extract_tool_calls_streaming(
current_text: str, self,
delta_text: str, previous_text: str,
previous_token_ids: Sequence[int], current_text: str,
current_token_ids: Sequence[int], delta_text: str,
delta_token_ids: Sequence[int], previous_token_ids: Sequence[int],
request: ChatCompletionRequest, current_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]: delta_token_ids: Sequence[int],
return delta request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]:
# implement the tool parse for non-stream call return delta
def extract_tool_calls(
self, # implement the tool parse for non-stream call
model_output: str, def extract_tool_calls(
request: ChatCompletionRequest, self,
) -> ExtractedToolCallInformation: model_output: str,
return ExtractedToolCallInformation(tools_called=False, request: ChatCompletionRequest,
tool_calls=[], ) -> ExtractedToolCallInformation:
content=text) return ExtractedToolCallInformation(tools_called=False,
tool_calls=[],
``` content=text)
```
Then you can use this plugin in the command line like this. Then you can use this plugin in the command line like this.
......
...@@ -76,21 +76,23 @@ Currently, there are no pre-built CPU wheels. ...@@ -76,21 +76,23 @@ Currently, there are no pre-built CPU wheels.
### Build image from source ### Build image from source
```console ??? Commands
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
```console
# Launching OpenAI server $ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
$ docker run --rm \
--privileged=true \ # Launching OpenAI server
--shm-size=4g \ $ docker run --rm \
-p 8000:8000 \ --privileged=true \
-e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \ --shm-size=4g \
-e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \ -p 8000:8000 \
vllm-cpu-env \ -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
--model=meta-llama/Llama-3.2-1B-Instruct \ -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
--dtype=bfloat16 \ vllm-cpu-env \
other vLLM OpenAI server arguments --model=meta-llama/Llama-3.2-1B-Instruct \
``` --dtype=bfloat16 \
other vLLM OpenAI server arguments
```
!!! tip !!! tip
For ARM or Apple silicon, use `docker/Dockerfile.arm` For ARM or Apple silicon, use `docker/Dockerfile.arm`
...@@ -144,32 +146,34 @@ vllm serve facebook/opt-125m ...@@ -144,32 +146,34 @@ vllm serve facebook/opt-125m
- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
```console ??? Commands
$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
```console
# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000
11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000
12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000
13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000
15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
$ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic/basic.py # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15
``` $ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic/basic.py
```
- If using vLLM CPU backend on a multi-socket machine with NUMA, make sure to set the CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access. - If using vLLM CPU backend on a multi-socket machine with NUMA, make sure to set the CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross-NUMA-node memory access.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment