Commit 006693ed authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.11.2' into v0.11.2-ori

parents 4b51e6f1 275de341
...@@ -6,7 +6,11 @@ This quantization method is particularly useful for reducing model size while ma ...@@ -6,7 +6,11 @@ This quantization method is particularly useful for reducing model size while ma
Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
!!! note !!! note
INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell). INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper).
!!! warning
**Blackwell GPU Limitation**: INT8 is not supported on compute capability >= 100 (e.g., RTX 6000 Blackwell).
Use [FP8 quantization](fp8.md) instead, or run on Hopper/Ada/Ampere architectures.
## Prerequisites ## Prerequisites
...@@ -40,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM ...@@ -40,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
dtype="auto",
) )
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
``` ```
......
...@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll ...@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
def main(): def main():
model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
sampling_params = SamplingParams(temperature=0.8, top_p=0.9) sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
......
...@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization: ...@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
sampling_params = SamplingParams(temperature=0.7, top_p=0.8) sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", llm = LLM(
kv_cache_dtype="fp8", model="meta-llama/Llama-2-7b-chat-hf",
calculate_kv_scales=True) kv_cache_dtype="fp8",
calculate_kv_scales=True,
)
prompt = "London is the capital of" prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out) print(out)
...@@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models ...@@ -80,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
# Select model and load it # Select model and load it
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset # Select calibration dataset
......
...@@ -48,7 +48,9 @@ to fetch model and tokenizer. ...@@ -48,7 +48,9 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512 MAX_SEQ_LEN = 512
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", MODEL_ID,
device_map="auto",
dtype="auto",
) )
model.eval() model.eval()
...@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib ...@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA] text_data = dataset["text"][:NUM_CALIBRATION_DATA]
tokenized_outputs = tokenizer(text_data, return_tensors="pt", tokenized_outputs = tokenizer(
padding=True, truncation=True, max_length=MAX_SEQ_LEN) text_data,
calib_dataloader = DataLoader(tokenized_outputs['input_ids'], return_tensors="pt",
batch_size=BATCH_SIZE, drop_last=True) padding=True,
truncation=True,
max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE,
drop_last=True,
)
``` ```
### 3. Set the Quantization Configuration ### 3. Set the Quantization Configuration
...@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. ...@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
load_quant_algo_config_from_file) load_quant_algo_config_from_file)
# Define fp8/per-tensor/static spec. # Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max", FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
is_dynamic=False).to_quantization_spec() observer_method="min_max",
is_dynamic=False,
).to_quantization_spec()
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC. # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC, global_quant_config = QuantizationConfig(
weight=FP8_PER_TENSOR_SPEC) input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC,
)
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC. # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name : kv_cache_quant_config = {
QuantizationConfig(input_tensors=global_quant_config.input_tensors, name: QuantizationConfig(
weight=global_quant_config.weight, input_tensors=global_quant_config.input_tensors,
output_tensors=KV_CACHE_SPEC) weight=global_quant_config.weight,
for name in kv_cache_layer_names_for_llama} output_tensors=KV_CACHE_SPEC,
)
for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy() layer_quant_config = kv_cache_quant_config.copy()
# Define algorithm config by config file. # Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
EXCLUDE_LAYERS = ["lm_head"] EXCLUDE_LAYERS = ["lm_head"]
...@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. ...@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
layer_quant_config=layer_quant_config, layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config, kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS, exclude=EXCLUDE_LAYERS,
algo_config=algo_config) algo_config=algo_config,
)
``` ```
### 4. Quantize the Model and Export ### 4. Quantize the Model and Export
...@@ -165,8 +182,11 @@ for more exporting format details. ...@@ -165,8 +182,11 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad(): with torch.no_grad():
exporter.export_safetensors_model(freezed_model, exporter.export_safetensors_model(
quant_config=quant_config, tokenizer=tokenizer) freezed_model,
quant_config=quant_config,
tokenizer=tokenizer,
)
``` ```
### 5. Evaluation in vLLM ### 5. Evaluation in vLLM
...@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent ...@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. # Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", llm = LLM(
kv_cache_dtype='fp8',quantization='quark') model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype="fp8",
quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -231,9 +254,9 @@ python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \ ...@@ -231,9 +254,9 @@ python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
--tasks gsm8k --tasks gsm8k
``` ```
## Using MXFP4 models ## Using OCP MX (MXFP4, MXFP6) models
vLLM supports loading MXFP4 models quantized offline through AMD Quark, compliant with [Open Compute Project (OCP) specification](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). vLLM supports loading MXFP4 and MXFP6 models quantized offline through AMD Quark, compliant with [Open Compute Project (OCP) specification](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
The scheme currently only supports dynamic quantization for activations. The scheme currently only supports dynamic quantization for activations.
...@@ -241,17 +264,53 @@ Example usage, after installing the latest AMD Quark release: ...@@ -241,17 +264,53 @@ Example usage, after installing the latest AMD Quark release:
```bash ```bash
vllm serve fxmarty/qwen_1.5-moe-a2.7b-mxfp4 --tensor-parallel-size 1 vllm serve fxmarty/qwen_1.5-moe-a2.7b-mxfp4 --tensor-parallel-size 1
# or, for a model using fp6 activations and fp4 weights:
vllm serve fxmarty/qwen1.5_moe_a2.7b_chat_w_fp4_a_fp6_e2m3 --tensor-parallel-size 1
``` ```
A simulation of the matrix multiplication execution in MXFP4 can be run on devices that do not support MXFP4 operations natively (e.g. AMD Instinct MI325, MI300 and MI250), dequantizing weights from MXFP4 to half precision on the fly, using a fused kernel. This is useful e.g. to evaluate MXFP4 models using vLLM, or alternatively to benefit from the ~4x memory savings (compared to float16 and bfloat16). A simulation of the matrix multiplication execution in MXFP4/MXFP6 can be run on devices that do not support OCP MX operations natively (e.g. AMD Instinct MI325, MI300 and MI250), dequantizing weights from FP4/FP6 to half precision on the fly, using a fused kernel. This is useful e.g. to evaluate FP4/FP6 models using vLLM, or alternatively to benefit from the ~2.5-4x memory savings (compared to float16 and bfloat16).
To generate offline models quantized using MXFP4 data type, the easiest approach is to use AMD Quark's [quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html), as an example: To generate offline models quantized using MXFP4 data type, the easiest approach is to use AMD Quark's [quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html), as an example:
```bash ```bash
python quantize_quark.py --model_dir Qwen/Qwen1.5-MoE-A2.7B-Chat \ python quantize_quark.py --model_dir Qwen/Qwen1.5-MoE-A2.7B-Chat \
--quant_scheme w_mxfp4_a_mxfp4_sym \ --quant_scheme w_mxfp4_a_mxfp4 \
--output_dir qwen_1.5-moe-a2.7b-mxfp4 \ --output_dir qwen_1.5-moe-a2.7b-mxfp4 \
--skip_evaluation \ --skip_evaluation \
--model_export hf_format \ --model_export hf_format \
--group_size 32 --group_size 32
``` ```
The current integration supports [all combination of FP4, FP6_E3M2, FP6_E2M3](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py) used for either weights or activations.
## Using Quark Quantized layerwise Auto Mixed Precision (AMP) Models
vLLM also supports loading layerwise mixed precision model quantized using AMD Quark. Currently, mixed scheme of {MXFP4, FP8} is supported, where FP8 here denotes for FP8 per-tensor scheme. More mixed precision schemes are planned to be supported in a near future, including
- Unquantized Linear and/or MoE layer(s) as an option for each layer, i.e., mixed of {MXFP4, FP8, BF16/FP16}
- MXFP6 quantization extension, i.e., {MXFP4, MXFP6, FP8, BF16/FP16}
Although one can maximize serving throughput using the lowest precision supported on a given device (e.g. MXFP4 for AMD Instinct MI355, FP8 for AMD Instinct MI300), these aggressive schemes can be detrimental to accuracy recovering from quantization on target tasks. Mixed precision allows to strike a balance between maximizing accuracy and throughput.
There are two steps to generate and deploy a mixed precision model quantized with AMD Quark, as shown below.
### 1. Quantize a model using mixed precision in AMD Quark
Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later.
As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benefits. They are:
- amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
- amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
- amd/Qwen3-8B-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
### 2. inference the quantized mixed precision model in vLLM
Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follow:
```bash
lm_eval --model vllm \
--model_args pretrained=amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8,tensor_parallel_size=4,dtype=auto,gpu_memory_utilization=0.8,trust_remote_code=False \
--tasks mmlu \
--batch_size auto
```
...@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht ...@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
quantization_config = TorchAoConfig(Int8WeightOnlyConfig()) quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained( quantized_model = AutoModelForCausalLM.from_pretrained(
model_name, model_name,
torch_dtype="auto", dtype="auto",
device_map="auto", device_map="auto",
quantization_config=quantization_config quantization_config=quantization_config
) )
......
...@@ -2,7 +2,10 @@ ...@@ -2,7 +2,10 @@
vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
Reasoning models return an additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. Reasoning models return an additional `reasoning` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models.
!!! warning
`reasoning` used to be called `reasoning_content`. For now, `reasoning_content` will continue to work. However, we encourage you to migrate to `reasoning` in case `reasoning_content` is removed in future.
## Supported Models ## Supported Models
...@@ -11,15 +14,20 @@ vLLM currently supports the following reasoning models: ...@@ -11,15 +14,20 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling | | Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------| |--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | | [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
| [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
| [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [MiniMax-M2](https://huggingface.co/MiniMaxAI/MiniMax-M2) | `minimax_m2_append_think` | `json`, `regex` | ✅ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ | | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
!!! note !!! note
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`. The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
DeepSeek-V3.1 tool calling is supported in non-thinking mode.
## Quickstart ## Quickstart
...@@ -56,18 +64,18 @@ Next, make a request to the model that should return the reasoning content in th ...@@ -56,18 +64,18 @@ Next, make a request to the model that should return the reasoning content in th
# extra_body={"chat_template_kwargs": {"enable_thinking": False}} # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
response = client.chat.completions.create(model=model, messages=messages) response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content reasoning = response.choices[0].message.reasoning
content = response.choices[0].message.content content = response.choices[0].message.content
print("reasoning_content:", reasoning_content) print("reasoning:", reasoning)
print("content:", content) print("content:", content)
``` ```
The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. The `reasoning` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion.
## Streaming chat completions ## Streaming chat completions
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). Streaming chat completions are also supported for reasoning models. The `reasoning` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).
??? console "Json" ??? console "Json"
...@@ -83,7 +91,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni ...@@ -83,7 +91,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
"index": 0, "index": 0,
"delta": { "delta": {
"role": "assistant", "role": "assistant",
"reasoning_content": "is", "reasoning": "is",
}, },
"logprobs": null, "logprobs": null,
"finish_reason": null "finish_reason": null
...@@ -92,7 +100,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni ...@@ -92,7 +100,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni
} }
``` ```
OpenAI Python client library does not officially support `reasoning_content` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning_content` attribute is present in the response. For example: OpenAI Python client library does not officially support `reasoning` attribute for streaming output. But the client supports extra attributes in the response. You can use `hasattr` to check if the `reasoning` attribute is present in the response. For example:
??? code ??? code
...@@ -115,27 +123,29 @@ OpenAI Python client library does not officially support `reasoning_content` att ...@@ -115,27 +123,29 @@ OpenAI Python client library does not officially support `reasoning_content` att
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add: # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}} # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model, stream = client.chat.completions.create(
messages=messages, model=model,
stream=True) messages=messages,
stream=True,
)
print("client: Start streaming chat completions...") print("client: Start streaming chat completions...")
printed_reasoning_content = False printed_reasoning = False
printed_content = False printed_content = False
for chunk in stream: for chunk in stream:
# Safely extract reasoning_content and content from delta, # Safely extract reasoning and content from delta,
# defaulting to None if attributes don't exist or are empty strings # defaulting to None if attributes don't exist or are empty strings
reasoning_content = ( reasoning = (
getattr(chunk.choices[0].delta, "reasoning_content", None) or None getattr(chunk.choices[0].delta, "reasoning", None) or None
) )
content = getattr(chunk.choices[0].delta, "content", None) or None content = getattr(chunk.choices[0].delta, "content", None) or None
if reasoning_content is not None: if reasoning is not None:
if not printed_reasoning_content: if not printed_reasoning:
printed_reasoning_content = True printed_reasoning = True
print("reasoning_content:", end="", flush=True) print("reasoning:", end="", flush=True)
print(reasoning_content, end="", flush=True) print(reasoning, end="", flush=True)
elif content is not None: elif content is not None:
if not printed_content: if not printed_content:
printed_content = True printed_content = True
...@@ -144,11 +154,11 @@ OpenAI Python client library does not officially support `reasoning_content` att ...@@ -144,11 +154,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
print(content, end="", flush=True) print(content, end="", flush=True)
``` ```
Remember to check whether the `reasoning_content` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py). Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
## Tool Calling ## Tool Calling
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`. The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning`.
??? code ??? code
...@@ -157,38 +167,40 @@ The reasoning content is also available when both tool calling and the reasoning ...@@ -157,38 +167,40 @@ The reasoning content is also available when both tool calling and the reasoning
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
tools = [{ tools = [
"type": "function", {
"function": { "type": "function",
"name": "get_weather", "function": {
"description": "Get the current weather in a given location", "name": "get_weather",
"parameters": { "description": "Get the current weather in a given location",
"type": "object", "parameters": {
"properties": { "type": "object",
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, "properties": {
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
"required": ["location", "unit"] },
} "required": ["location", "unit"],
}
},
} }
}] ]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto" tool_choice="auto",
) )
print(response) print(response)
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {response.choices[0].message.reasoning_content}") print(f"reasoning: {response.choices[0].message.reasoning}")
print(f"Function called: {tool_call.name}") print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}") print(f"Arguments: {tool_call.arguments}")
``` ```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py>. For more examples, please refer to [examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py](../../examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py).
## Limitations ## Limitations
...@@ -196,7 +208,7 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_ ...@@ -196,7 +208,7 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_
## How to support a new reasoning model ## How to support a new reasoning model
You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>. You can add a new `ReasoningParser` similar to [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
??? code ??? code
...@@ -210,12 +222,11 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_ ...@@ -210,12 +222,11 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
# define a reasoning parser and register it to vllm # define a reasoning parser and register it to vllm
# the name list in register_module can be used # the name list in register_module can be used
# in --reasoning-parser. # in --reasoning-parser.
@ReasoningParserManager.register_module(["example"])
class ExampleParser(ReasoningParser): class ExampleParser(ReasoningParser):
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer) super().__init__(tokenizer)
def extract_reasoning_content_streaming( def extract_reasoning_streaming(
self, self,
previous_text: str, previous_text: str,
current_text: str, current_text: str,
...@@ -223,7 +234,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_ ...@@ -223,7 +234,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
previous_token_ids: Sequence[int], previous_token_ids: Sequence[int],
current_token_ids: Sequence[int], current_token_ids: Sequence[int],
delta_token_ids: Sequence[int], delta_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]: ) -> DeltaMessage | None:
""" """
Instance method that should be implemented for extracting reasoning Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and from an incomplete response; for use when handling reasoning calls and
...@@ -232,9 +243,11 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_ ...@@ -232,9 +243,11 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
previously been parsed and extracted (see constructor) previously been parsed and extracted (see constructor)
""" """
def extract_reasoning_content( def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest self,
) -> tuple[Optional[str], Optional[str]]: model_output: str,
request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
""" """
Extract reasoning content from a complete model-generated string. Extract reasoning content from a complete model-generated string.
...@@ -252,9 +265,15 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_ ...@@ -252,9 +265,15 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
tuple[Optional[str], Optional[str]] tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content. A tuple containing the reasoning content and the content.
""" """
# Register the reasoning parser
ReasoningParserManager.register_lazy_module(
name="example",
module_path="vllm.reasoning.example_reasoning_parser",
class_name="ExampleParser",
)
``` ```
Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>. Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in [vllm/reasoning/deepseek_r1_reasoning_parser.py](../../vllm/reasoning/deepseek_r1_reasoning_parser.py).
??? code ??? code
...@@ -272,10 +291,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner ...@@ -272,10 +291,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
@classmethod @classmethod
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
return cls(start_token_id=tokenizer.encode( return cls(
"<think>", add_special_tokens=False)[0], start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>", end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
add_special_tokens=False)[0]) )
def is_reasoning_end(self, input_ids: list[int]) -> bool: def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids return self.end_token_id in input_ids
......
...@@ -11,7 +11,10 @@ Key benefits: ...@@ -11,7 +11,10 @@ Key benefits:
- **Fine-grained control**: Optionally wake up only model weights or KV cache to avoid OOM during weight updates. - **Fine-grained control**: Optionally wake up only model weights or KV cache to avoid OOM during weight updates.
!!! note !!! note
This feature is only supported on CUDA platform. This feature is now supported on CUDA and ROCm platform.
!!! note
For more information, see this [Blog Post](https://blog.vllm.ai/2025/10/26/sleep-mode.html).
## Sleep levels ## Sleep levels
...@@ -31,6 +34,7 @@ llm = LLM("Qwen/Qwen3-0.6B", enable_sleep_mode=True) ...@@ -31,6 +34,7 @@ llm = LLM("Qwen/Qwen3-0.6B", enable_sleep_mode=True)
#### Python API #### Python API
```python ```python
# Sleep level 1
# Put the engine to sleep (level=1: offload weights to CPU RAM, discard KV cache) # Put the engine to sleep (level=1: offload weights to CPU RAM, discard KV cache)
llm.sleep(level=1) llm.sleep(level=1)
...@@ -38,6 +42,21 @@ llm.sleep(level=1) ...@@ -38,6 +42,21 @@ llm.sleep(level=1)
llm.wake_up() llm.wake_up()
``` ```
```python
# Sleep level 2
# Put the engine to sleep (level=2: discard both weights and KV cache)
llm.sleep(level=2)
# Reallocate weights memory only
llm.wake_up(tags=["weights"])
# Load weights in-place
llm.collective_rpc("reload_weights")
# Reallocate KV cache
llm.wake_up(tags=["kv_cache"])
```
#### RLHF weight updates #### RLHF weight updates
During RLHF training, vLLM allows you to selectively wake up only the model weights or the KV cache using the tags argument in wake_up(). This fine-grained control is especially useful when updating model weights: by waking up just the weights (e.g., llm.wake_up(tags=["weights"])), you avoid allocating memory for the KV cache until after the weight update is complete. This approach helps prevent GPU out-of-memory (OOM) errors, particularly with large models, by minimizing peak memory usage during weight synchronization and update operations. During RLHF training, vLLM allows you to selectively wake up only the model weights or the KV cache using the tags argument in wake_up(). This fine-grained control is especially useful when updating model weights: by waking up just the weights (e.g., llm.wake_up(tags=["weights"])), you avoid allocating memory for the KV cache until after the weight update is complete. This approach helps prevent GPU out-of-memory (OOM) errors, particularly with large models, by minimizing peak memory usage during weight synchronization and update operations.
...@@ -64,17 +83,40 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V ...@@ -64,17 +83,40 @@ To enable sleep mode in a vLLM server you need to initialize it with the flag `V
When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users. When using the flag `VLLM_SERVER_DEV_MODE=1` you enable development endpoints, and these endpoints should not be exposed to users.
```bash ```bash
VLLM_SERVER_DEV_MODE=1 python -m vllm.entrypoints.openai.api_server \ VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-0.6B \
--model Qwen/Qwen3-0.6B \
--enable-sleep-mode \ --enable-sleep-mode \
--port 8000 --port 8000
``` ```
Below is an example of how to sleep and wake up a model in level 1.
```bash
curl -X POST 'http://localhost:8000/sleep?level=1'
curl -X POST 'http://localhost:8000/wake_up'
```
And this is an example of how to sleep and wake up a model in level 2.
```bash
curl -X POST 'http://localhost:8000/sleep?level=2'
# Reallocate weights memory only
curl -X POST 'http://localhost:8000/wake_up?tags=weights'
# Load weights in-place
curl -X POST 'http://localhost:8000/collective_rpc' -H 'Content-Type: application/json' -d '{"method":"reload_weights"}'
# Reallocate KV cache
curl -X POST 'http://localhost:8000/wake_up?tags=kv_cache'
```
#### HTTP endpoints #### HTTP endpoints
- `POST /sleep?level=1` — Put the model to sleep (`level=1`). - `POST /sleep?level=1` — Put the model to sleep (`level=1`).
- `POST /wake_up` — Wake up the model. Supports optional `tags` query parameters for partial wake-up (e.g., `?tags=weights`). - `POST /wake_up` — Wake up the model. Supports optional `tags` query parameters for partial wake-up (e.g., `?tags=weights`).
- `POST /collective_rpc` — Perform a collective remote procedure call (RPC).
- `GET /is_sleeping` — Check if the model is sleeping. - `GET /is_sleeping` — Check if the model is sleeping.
!!! note !!! note
These endpoints are only available when passing `VLLM_SERVER_DEV_MODE=1`. These endpoints are only available when passing `VLLM_SERVER_DEV_MODE=1`.
## Limitation
On ROCm, the virtual memory allocation on ROCm is done through chunked memory allocation. You can control the chunk size through `VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE` (in MB). The default value is set at 256MB. The larger the chunk size the faster the performance. However, setting it too large will cause OOM. So if you encounter OOM when using sleep mode. Try reducing the chunk size. It is recommended to define the chunk size as a power of 2.
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
!!! warning !!! warning
Please note that speculative decoding in vLLM is not yet optimized and does Please note that speculative decoding in vLLM is not yet optimized and does
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
The work to optimize it is ongoing and can be followed here: <gh-issue:4630> The work to optimize it is ongoing and can be followed here: <https://github.com/vllm-project/vllm/issues/4630>
!!! warning !!! warning
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
...@@ -48,10 +48,9 @@ The following code configures vLLM in an offline mode to use speculative decodin ...@@ -48,10 +48,9 @@ The following code configures vLLM in an offline mode to use speculative decodin
To perform the same with an online mode launch the server: To perform the same with an online mode launch the server:
```bash ```bash
python -m vllm.entrypoints.openai.api_server \ vllm serve facebook/opt-6.7b \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port 8000 \ --port 8000 \
--model facebook/opt-6.7b \
--seed 42 \ --seed 42 \
-tp 1 \ -tp 1 \
--gpu_memory_utilization 0.8 \ --gpu_memory_utilization 0.8 \
...@@ -131,6 +130,46 @@ matching n-grams in the prompt. For more information read [this thread.](https:/ ...@@ -131,6 +130,46 @@ matching n-grams in the prompt. For more information read [this thread.](https:/
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
## Speculating using Suffix Decoding
The following code configures vLLM to use speculative decoding where proposals are generated using Suffix Decoding ([technical report](https://arxiv.org/abs/2411.04975)).
Like n-gram, Suffix Decoding can generate draft tokens by pattern-matching using the last `n` generated tokens. Unlike n-gram, Suffix Decoding (1) can pattern-match against both the prompt and previous generations, (2) uses frequency counts to propose the most likely continuations, and (3) speculates an adaptive number of tokens for each request at each iteration to get better acceptance rates.
Suffix Decoding can achieve better performance for tasks with high repetition, such as code-editing, agentic loops (e.g. self-reflection, self-consistency), and RL rollouts.
!!! tip "Install Arctic Inference"
Suffix Decoding requires [Arctic Inference](https://github.com/snowflakedb/ArcticInference). You can install it with `pip install arctic-inference`.
!!! tip "Suffix Decoding Speculative Tokens"
Suffix Decoding will speculate a dynamic number of tokens for each request at each decoding step, so the `num_speculative_tokens` configuration specifies the *maximum* number of speculative tokens. It is suggested to use a high number such as `16` or `32` (default).
??? code
```python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
speculative_config={
"method": "suffix",
"num_speculative_tokens": 32,
},
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
## Speculating using MLP speculators ## Speculating using MLP speculators
The following code configures vLLM to use speculative decoding where proposals are generated by The following code configures vLLM to use speculative decoding where proposals are generated by
...@@ -184,7 +223,7 @@ A variety of speculative models of this type are available on HF hub: ...@@ -184,7 +223,7 @@ A variety of speculative models of this type are available on HF hub:
## Speculating using EAGLE based draft models ## Speculating using EAGLE based draft models
The following code configures vLLM to use speculative decoding where proposals are generated by The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py). an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](../../examples/offline_inference/spec_decode.py).
??? code ??? code
...@@ -219,8 +258,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https ...@@ -219,8 +258,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
A few important things to consider when using the EAGLE based draft models: A few important things to consider when using the EAGLE based draft models:
1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
be able to be loaded and used directly by vLLM after <gh-pr:12304>. be able to be loaded and used directly by vLLM after <https://github.com/vllm-project/vllm/pull/12304>.
If you are using vllm version before <gh-pr:12304>, please use the If you are using vllm version before <https://github.com/vllm-project/vllm/pull/12304>, please use the
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model, [script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue. and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
...@@ -230,7 +269,7 @@ A few important things to consider when using the EAGLE based draft models: ...@@ -230,7 +269,7 @@ A few important things to consider when using the EAGLE based draft models:
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is 3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
investigation and tracked here: <gh-issue:9565>. investigation and tracked here: <https://github.com/vllm-project/vllm/issues/9565>.
4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3". 4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
That is, to specify `"method": "eagle3"` in `speculative_config`. That is, to specify `"method": "eagle3"` in `speculative_config`.
...@@ -268,7 +307,7 @@ speculative decoding, breaking down the guarantees into three key areas: ...@@ -268,7 +307,7 @@ speculative decoding, breaking down the guarantees into three key areas:
> distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252) > distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252)
> - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling > - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling
> without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, > without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler,
> provides a lossless guarantee. Almost all of the tests in <gh-dir:tests/spec_decode/e2e>. > provides a lossless guarantee. Almost all of the tests in [tests/spec_decode/e2e](../../tests/spec_decode/e2e).
> verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291) > verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291)
3. **vLLM Logprob Stability** 3. **vLLM Logprob Stability**
...@@ -290,4 +329,4 @@ For mitigation strategies, please refer to the FAQ entry *Can the output of a pr ...@@ -290,4 +329,4 @@ For mitigation strategies, please refer to the FAQ entry *Can the output of a pr
- [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4) - [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4)
- [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a) - [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a)
- [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8) - [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8)
- [Dynamic speculative decoding](gh-issue:4565) - [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565)
...@@ -204,7 +204,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th ...@@ -204,7 +204,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th
} }
}, },
) )
print("reasoning_content: ", completion.choices[0].message.reasoning_content) print("reasoning: ", completion.choices[0].message.reasoning)
print("content: ", completion.choices[0].message.content) print("content: ", completion.choices[0].message.content)
``` ```
...@@ -298,7 +298,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa ...@@ -298,7 +298,7 @@ Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equa
Answer: x = -29/8 Answer: x = -29/8
``` ```
An example of using `structural_tag` can be found here: <gh-file:examples/online_serving/structured_outputs> An example of using `structural_tag` can be found here: [examples/online_serving/structured_outputs](../../examples/online_serving/structured_outputs)
## Offline Inference ## Offline Inference
......
...@@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools: ...@@ -27,27 +27,29 @@ Next, make a request that triggers the model to use the available tools:
return f"Getting the weather for {location} in {unit}..." return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather} tool_functions = {"get_weather": get_weather}
tools = [{ tools = [
"type": "function", {
"function": { "type": "function",
"name": "get_weather", "function": {
"description": "Get the current weather in a given location", "name": "get_weather",
"parameters": { "description": "Get the current weather in a given location",
"type": "object", "parameters": {
"properties": { "type": "object",
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, "properties": {
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"],
}, },
"required": ["location", "unit"] },
} },
} ]
}]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto" tool_choice="auto",
) )
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
...@@ -145,16 +147,23 @@ Supported models: ...@@ -145,16 +147,23 @@ Supported models:
Known issues: Known issues:
1. Mistral 7B struggles to generate parallel tool calls correctly. 1. Mistral 7B struggles to generate parallel tool calls correctly.
2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is 2. **For Transformers tokenization backend only**: Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is
much shorter than what vLLM generates. Since an exception is thrown when this condition much shorter than what vLLM generates. Since an exception is thrown when this condition
is not met, the following additional chat templates are provided: is not met, the following additional chat templates are provided:
* <gh-file:examples/tool_chat_template_mistral.jinja> - this is the "official" Mistral chat template, but tweaked so that * [examples/tool_chat_template_mistral.jinja](../../examples/tool_chat_template_mistral.jinja) - this is the "official" Mistral chat template, but tweaked so that
it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits) it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits)
* <gh-file:examples/tool_chat_template_mistral_parallel.jinja> - this is a "better" version that adds a tool-use system prompt * [examples/tool_chat_template_mistral_parallel.jinja](../../examples/tool_chat_template_mistral_parallel.jinja) - this is a "better" version that adds a tool-use system prompt
when tools are provided, that results in much better reliability when working with parallel tool calling. when tools are provided, that results in much better reliability when working with parallel tool calling.
Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` Recommended flags:
1. To use [mistral-common](https://github.com/mistralai/mistral-common) the official Mistral tokenization backend:
`--tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral`
2. To use the default Transformers tokenization backend:
`--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
### Llama Models (`llama3_json`) ### Llama Models (`llama3_json`)
...@@ -178,28 +187,32 @@ Known issues: ...@@ -178,28 +187,32 @@ Known issues:
VLLM provides two JSON-based chat templates for Llama 3.1 and 3.2: VLLM provides two JSON-based chat templates for Llama 3.1 and 3.2:
* <gh-file:examples/tool_chat_template_llama3.1_json.jinja> - this is the "official" chat template for the Llama 3.1 * [examples/tool_chat_template_llama3.1_json.jinja](../../examples/tool_chat_template_llama3.1_json.jinja) - this is the "official" chat template for the Llama 3.1
models, but tweaked so that it works better with vLLM. models, but tweaked so that it works better with vLLM.
* <gh-file:examples/tool_chat_template_llama3.2_json.jinja> - this extends upon the Llama 3.1 chat template by adding support for * [examples/tool_chat_template_llama3.2_json.jinja](../../examples/tool_chat_template_llama3.2_json.jinja) - this extends upon the Llama 3.1 chat template by adding support for
images. images.
Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}` Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`
VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pythonic tool calling is recommended: VLLM also provides a pythonic and JSON-based chat template for Llama 4, but pythonic tool calling is recommended:
* <gh-file:examples/tool_chat_template_llama4_pythonic.jinja> - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models. * [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja) - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.
For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`. For Llama 4 model, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.
#### IBM Granite ### IBM Granite
Supported models: Supported models:
* `ibm-granite/granite-4.0-h-small` and other Granite 4.0 models
Recommended flags: `--tool-call-parser hermes`
* `ibm-granite/granite-3.0-8b-instruct` * `ibm-granite/granite-3.0-8b-instruct`
Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
<gh-file:examples/tool_chat_template_granite.jinja>: this is a modified chat template from the original on Hugging Face. Parallel function calls are supported. [examples/tool_chat_template_granite.jinja](../../examples/tool_chat_template_granite.jinja): this is a modified chat template from the original on Hugging Face. Parallel function calls are supported.
* `ibm-granite/granite-3.1-8b-instruct` * `ibm-granite/granite-3.1-8b-instruct`
...@@ -211,7 +224,7 @@ Supported models: ...@@ -211,7 +224,7 @@ Supported models:
Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja`
<gh-file:examples/tool_chat_template_granite_20b_fc.jinja>: this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. [examples/tool_chat_template_granite_20b_fc.jinja](../../examples/tool_chat_template_granite_20b_fc.jinja): this is a modified chat template from the original on Hugging Face, which is not vLLM-compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
### InternLM Models (`internlm`) ### InternLM Models (`internlm`)
...@@ -269,8 +282,8 @@ Flags: `--tool-call-parser hermes` ...@@ -269,8 +282,8 @@ Flags: `--tool-call-parser hermes`
Supported models: Supported models:
* `MiniMaxAi/MiniMax-M1-40k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>) * `MiniMaxAi/MiniMax-M1-40k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
* `MiniMaxAi/MiniMax-M1-80k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>) * `MiniMaxAi/MiniMax-M1-80k` (use with [examples/tool_chat_template_minimax_m1.jinja](../../examples/tool_chat_template_minimax_m1.jinja))
Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja` Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`
...@@ -278,8 +291,8 @@ Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_m ...@@ -278,8 +291,8 @@ Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_m
Supported models: Supported models:
* `deepseek-ai/DeepSeek-V3-0324` (use with <gh-file:examples/tool_chat_template_deepseekv3.jinja>) * `deepseek-ai/DeepSeek-V3-0324` (use with [examples/tool_chat_template_deepseekv3.jinja](../../examples/tool_chat_template_deepseekv3.jinja))
* `deepseek-ai/DeepSeek-R1-0528` (use with <gh-file:examples/tool_chat_template_deepseekr1.jinja>) * `deepseek-ai/DeepSeek-R1-0528` (use with [examples/tool_chat_template_deepseekr1.jinja](../../examples/tool_chat_template_deepseekr1.jinja))
Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}` Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
...@@ -287,7 +300,7 @@ Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}` ...@@ -287,7 +300,7 @@ Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}`
Supported models: Supported models:
* `deepseek-ai/DeepSeek-V3.1` (use with <gh-file:examples/tool_chat_template_deepseekv31.jinja>) * `deepseek-ai/DeepSeek-V3.1` (use with [examples/tool_chat_template_deepseekv31.jinja](../../examples/tool_chat_template_deepseekv31.jinja))
Flags: `--tool-call-parser deepseek_v31 --chat-template {see_above}` Flags: `--tool-call-parser deepseek_v31 --chat-template {see_above}`
...@@ -308,7 +321,7 @@ Supported models: ...@@ -308,7 +321,7 @@ Supported models:
Flags: Flags:
* For non-reasoning: `--tool-call-parser hunyuan_a13b` * For non-reasoning: `--tool-call-parser hunyuan_a13b`
* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning` * For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b`
### LongCat-Flash-Chat Models (`longcat`) ### LongCat-Flash-Chat Models (`longcat`)
...@@ -323,8 +336,10 @@ Flags: `--tool-call-parser longcat` ...@@ -323,8 +336,10 @@ Flags: `--tool-call-parser longcat`
Supported models: Supported models:
* `ZhipuAI/GLM-4.5` * `zai-org/GLM-4.5`
* `ZhipuAI/GLM-4.5-Air` * `zai-org/GLM-4.5-Air`
* `zai-org/GLM-4.6`
* `zai-org/GLM-4.6-Air`
Flags: `--tool-call-parser glm45` Flags: `--tool-call-parser glm45`
...@@ -337,6 +352,16 @@ Supported models: ...@@ -337,6 +352,16 @@ Supported models:
Flags: `--tool-call-parser qwen3_xml` Flags: `--tool-call-parser qwen3_xml`
### Olmo 3 Models (`olmo3`)
Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but the parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `<function_calls>..</function_calls>`. In addition, the parser also allows JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
Supported models:
* TODO (will be updated after Olmo 3 release)
Flags: `--tool-call-parser olmo3`
### Models with Pythonic Tool Calls (`pythonic`) ### Models with Pythonic Tool Calls (`pythonic`)
A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
...@@ -354,12 +379,12 @@ Limitations: ...@@ -354,12 +379,12 @@ Limitations:
Example supported models: Example supported models:
* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>) * `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>) * `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with [examples/tool_chat_template_llama3.2_pythonic.jinja](../../examples/tool_chat_template_llama3.2_pythonic.jinja))
* `Team-ACE/ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>) * `Team-ACE/ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>) * `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with [examples/tool_chat_template_toolace.jinja](../../examples/tool_chat_template_toolace.jinja))
* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>) * `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>) * `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with [examples/tool_chat_template_llama4_pythonic.jinja](../../examples/tool_chat_template_llama4_pythonic.jinja))
Flags: `--tool-call-parser pythonic --chat-template {see_above}` Flags: `--tool-call-parser pythonic --chat-template {see_above}`
...@@ -368,7 +393,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}` ...@@ -368,7 +393,7 @@ Flags: `--tool-call-parser pythonic --chat-template {see_above}`
## How to Write a Tool Parser Plugin ## How to Write a Tool Parser Plugin
A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in <gh-file:vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py>. A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in [vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py](../../vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py).
Here is a summary of a plugin file: Here is a summary of a plugin file:
...@@ -382,15 +407,13 @@ Here is a summary of a plugin file: ...@@ -382,15 +407,13 @@ Here is a summary of a plugin file:
# the name list in register_module can be used # the name list in register_module can be used
# in --tool-call-parser. you can define as many # in --tool-call-parser. you can define as many
# tool parsers as you want here. # tool parsers as you want here.
@ToolParserManager.register_module(["example"])
class ExampleToolParser(ToolParser): class ExampleToolParser(ToolParser):
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: AnyTokenizer):
super().__init__(tokenizer) super().__init__(tokenizer)
# adjust request. e.g.: set skip special tokens # adjust request. e.g.: set skip special tokens
# to False for tool call output. # to False for tool call output.
def adjust_request( def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
return request return request
# implement the tool call parse for stream call # implement the tool call parse for stream call
...@@ -403,7 +426,7 @@ Here is a summary of a plugin file: ...@@ -403,7 +426,7 @@ Here is a summary of a plugin file:
current_token_ids: Sequence[int], current_token_ids: Sequence[int],
delta_token_ids: Sequence[int], delta_token_ids: Sequence[int],
request: ChatCompletionRequest, request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]: ) -> DeltaMessage | None:
return delta return delta
# implement the tool parse for non-stream call # implement the tool parse for non-stream call
...@@ -415,6 +438,12 @@ Here is a summary of a plugin file: ...@@ -415,6 +438,12 @@ Here is a summary of a plugin file:
return ExtractedToolCallInformation(tools_called=False, return ExtractedToolCallInformation(tools_called=False,
tool_calls=[], tool_calls=[],
content=text) content=text)
# register the tool parser to ToolParserManager
ToolParserManager.register_lazy_module(
name="example",
module_path="vllm.entrypoints.openai.tool_parsers.example",
class_name="ExampleToolParser",
)
``` ```
......
...@@ -2,4 +2,4 @@ nav: ...@@ -2,4 +2,4 @@ nav:
- README.md - README.md
- gpu.md - gpu.md
- cpu.md - cpu.md
- google_tpu.md - TPU: https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/
...@@ -11,7 +11,6 @@ vLLM supports the following hardware platforms: ...@@ -11,7 +11,6 @@ vLLM supports the following hardware platforms:
- [ARM AArch64](cpu.md#arm-aarch64) - [ARM AArch64](cpu.md#arm-aarch64)
- [Apple silicon](cpu.md#apple-silicon) - [Apple silicon](cpu.md#apple-silicon)
- [IBM Z (S390X)](cpu.md#ibm-z-s390x) - [IBM Z (S390X)](cpu.md#ibm-z-s390x)
- [Google TPU](google_tpu.md)
## Hardware Plugins ## Hardware Plugins
...@@ -20,8 +19,10 @@ The backends below live **outside** the main `vllm` repository and follow the ...@@ -20,8 +19,10 @@ The backends below live **outside** the main `vllm` repository and follow the
| Accelerator | PyPI / package | Repository | | Accelerator | PyPI / package | Repository |
|-------------|----------------|------------| |-------------|----------------|------------|
| Google TPU | `tpu-inference` | <https://github.com/vllm-project/tpu-inference> |
| Ascend NPU | `vllm-ascend` | <https://github.com/vllm-project/vllm-ascend> | | Ascend NPU | `vllm-ascend` | <https://github.com/vllm-project/vllm-ascend> |
| Intel Gaudi (HPU) | N/A, install from source | <https://github.com/vllm-project/vllm-gaudi> | | Intel Gaudi (HPU) | N/A, install from source | <https://github.com/vllm-project/vllm-gaudi> |
| MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> | | MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> |
| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> | | Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> | | IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
...@@ -28,10 +28,15 @@ After installation of XCode and the Command Line Tools, which include Apple Clan ...@@ -28,10 +28,15 @@ After installation of XCode and the Command Line Tools, which include Apple Clan
```bash ```bash
git clone https://github.com/vllm-project/vllm.git git clone https://github.com/vllm-project/vllm.git
cd vllm cd vllm
uv pip install -r requirements/cpu.txt uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
uv pip install -e . uv pip install -e .
``` ```
!!! tip
The `--index-strategy unsafe-best-match` flag is needed to resolve dependencies across multiple package indexes (PyTorch CPU index and PyPI). Without this flag, you may encounter `typing-extensions` version conflicts.
The term "unsafe" refers to the package resolution strategy, not security. By default, `uv` only searches the first index where a package is found to prevent dependency confusion attacks. This flag allows `uv` to search all configured indexes to find the best compatible versions. Since both PyTorch and PyPI are trusted package sources, using this strategy is safe and appropriate for vLLM installation.
!!! note !!! note
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device. On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device.
......
...@@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. ...@@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
# --8<-- [end:pre-built-wheels] # --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source] # --8<-- [start:build-wheel-from-source]
--8<-- "docs/getting_started/installation/cpu/build.inc.md" First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
```
Testing has been conducted on AWS Graviton3 instances for compatibility. Testing has been conducted on AWS Graviton3 instances for compatibility.
......
...@@ -4,39 +4,39 @@ vLLM is a Python library that supports the following CPU variants. Select your C ...@@ -4,39 +4,39 @@ vLLM is a Python library that supports the following CPU variants. Select your C
=== "Intel/AMD x86" === "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:installation" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:installation"
=== "ARM AArch64" === "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:installation" --8<-- "docs/getting_started/installation/cpu.arm.inc.md:installation"
=== "Apple silicon" === "Apple silicon"
--8<-- "docs/getting_started/installation/cpu/apple.inc.md:installation" --8<-- "docs/getting_started/installation/cpu.apple.inc.md:installation"
=== "IBM Z (S390X)" === "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu/s390x.inc.md:installation" --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:installation"
## Requirements ## Requirements
- Python: 3.9 -- 3.12 - Python: 3.10 -- 3.13
=== "Intel/AMD x86" === "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:requirements" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:requirements"
=== "ARM AArch64" === "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:requirements" --8<-- "docs/getting_started/installation/cpu.arm.inc.md:requirements"
=== "Apple silicon" === "Apple silicon"
--8<-- "docs/getting_started/installation/cpu/apple.inc.md:requirements" --8<-- "docs/getting_started/installation/cpu.apple.inc.md:requirements"
=== "IBM Z (S390X)" === "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu/s390x.inc.md:requirements" --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:requirements"
## Set up using Python ## Set up using Python
...@@ -52,19 +52,19 @@ Currently, there are no pre-built CPU wheels. ...@@ -52,19 +52,19 @@ Currently, there are no pre-built CPU wheels.
=== "Intel/AMD x86" === "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-wheel-from-source" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
=== "ARM AArch64" === "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-wheel-from-source" --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-wheel-from-source"
=== "Apple silicon" === "Apple silicon"
--8<-- "docs/getting_started/installation/cpu/apple.inc.md:build-wheel-from-source" --8<-- "docs/getting_started/installation/cpu.apple.inc.md:build-wheel-from-source"
=== "IBM Z (s390x)" === "IBM Z (s390x)"
--8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-wheel-from-source" --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-wheel-from-source"
## Set up using Docker ## Set up using Docker
...@@ -72,29 +72,29 @@ Currently, there are no pre-built CPU wheels. ...@@ -72,29 +72,29 @@ Currently, there are no pre-built CPU wheels.
=== "Intel/AMD x86" === "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:pre-built-images" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:pre-built-images"
### Build image from source ### Build image from source
=== "Intel/AMD x86" === "Intel/AMD x86"
--8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-image-from-source" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-image-from-source"
=== "ARM AArch64" === "ARM AArch64"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source" --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"
=== "Apple silicon" === "Apple silicon"
--8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-image-from-source" --8<-- "docs/getting_started/installation/cpu.arm.inc.md:build-image-from-source"
=== "IBM Z (S390X)" === "IBM Z (S390X)"
--8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-image-from-source" --8<-- "docs/getting_started/installation/cpu.s390x.inc.md:build-image-from-source"
## Related runtime environment variables ## Related runtime environment variables
- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`. - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`.
- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. - `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists, `auto` (by default), or `nobind` (to disable binding to individual CPU cores and to inherit user-defined OpenMP variables). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. If set to `nobind`, the number of OpenMP threads is determined by the standard `OMP_NUM_THREADS` environment variable.
- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`. - `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`.
- `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence. - `CPU_VISIBLE_MEMORY_NODES`: specify visible NUMA memory nodes for vLLM CPU workers, similar to ```CUDA_VISIBLE_DEVICES```. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. The variable provides more control for the auto thread-binding feature, such as masking nodes and changing nodes binding sequence.
- `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False).
...@@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels. ...@@ -104,7 +104,7 @@ Currently, there are no pre-built CPU wheels.
### Which `dtype` should be used? ### Which `dtype` should be used?
- Currently vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem. - Currently, vLLM CPU uses model default settings as `dtype`. However, due to unstable float16 support in torch CPU, it is recommended to explicitly set `dtype=bfloat16` if there are any performance or accuracy problem.
### How to launch a vLLM service on CPU? ### How to launch a vLLM service on CPU?
...@@ -128,7 +128,7 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe ...@@ -128,7 +128,7 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
### How to decide `VLLM_CPU_OMP_THREADS_BIND`? ### How to decide `VLLM_CPU_OMP_THREADS_BIND`?
- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to a same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If have any performance problems or unexpected binding behaviours, please try to bind threads as following. - Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following.
- On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: - On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
...@@ -156,12 +156,12 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe ...@@ -156,12 +156,12 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000
15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000
# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 # On this platform, it is recommended to only bind openMP threads on logical CPU cores 0-7 or 8-15
$ export VLLM_CPU_OMP_THREADS_BIND=0-7 $ export VLLM_CPU_OMP_THREADS_BIND=0-7
$ python examples/offline_inference/basic/basic.py $ python examples/offline_inference/basic/basic.py
``` ```
- When deploy vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on a same NUMA node to avoid cross NUMA node memory access. - When deploying vLLM CPU backend on a multi-socket machine with NUMA and enable tensor parallel or pipeline parallel, each NUMA node is treated as a TP/PP rank. So be aware to set CPU cores of a single rank on the same NUMA node to avoid cross NUMA node memory access.
### How to decide `VLLM_CPU_KVCACHE_SPACE`? ### How to decide `VLLM_CPU_KVCACHE_SPACE`?
...@@ -171,7 +171,9 @@ This value is 4GB by default. Larger space can support more concurrent requests, ...@@ -171,7 +171,9 @@ This value is 4GB by default. Larger space can support more concurrent requests,
First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU cores usage via `htop`. First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU cores usage via `htop`.
Inference batch size is an important parameter for the performance. Larger batch usually provides higher throughput, smaller batch provides lower latency. Tuning max batch size starts from default value to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two important related parameters in vLLM: Use multiples of 32 as `--block-size`, which is 128 by default.
Inference batch size is an important parameter for the performance. A larger batch usually provides higher throughput, a smaller batch provides lower latency. Tuning the max batch size starting from the default value to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two important related parameters in vLLM:
- `--max-num-batched-tokens`, defines the limit of token numbers in a single batch, has more impacts on the first token performance. The default value is set as: - `--max-num-batched-tokens`, defines the limit of token numbers in a single batch, has more impacts on the first token performance. The default value is set as:
- Offline Inference: `4096 * world_size` - Offline Inference: `4096 * world_size`
...@@ -192,8 +194,8 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel ...@@ -192,8 +194,8 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel
### (x86 only) What is the purpose of `VLLM_CPU_MOE_PREPACK` and `VLLM_CPU_SGL_KERNEL`? ### (x86 only) What is the purpose of `VLLM_CPU_MOE_PREPACK` and `VLLM_CPU_SGL_KERNEL`?
- Both of them require `amx` CPU flag. - Both of them require `amx` CPU flag.
- `VLLM_CPU_MOE_PREPACK` can provides better performance for MoE models - `VLLM_CPU_MOE_PREPACK` can provide better performance for MoE models
- `VLLM_CPU_SGL_KERNEL` can provides better performance for MoE models and small-batch scenarios. - `VLLM_CPU_SGL_KERNEL` can provide better performance for MoE models and small-batch scenarios.
### Why do I see `get_mempolicy: Operation not permitted` when running in Docker? ### Why do I see `get_mempolicy: Operation not permitted` when running in Docker?
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform. vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform.
Currently the CPU implementation for s390x architecture supports FP32 datatype only. Currently, the CPU implementation for s390x architecture supports FP32 datatype only.
!!! warning !!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source. There are no pre-built wheels or images for this device, so you must build vLLM from source.
...@@ -46,22 +46,22 @@ Execute the following commands to build and install vLLM from source. ...@@ -46,22 +46,22 @@ Execute the following commands to build and install vLLM from source.
Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM. Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM.
```bash ```bash
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds sed -i '/^torch/d' requirements/build.txt # remove torch from requirements/build.txt since we use nightly builds
uv pip install -v \ uv pip install -v \
--torch-backend auto \ --torch-backend auto \
-r requirements-build.txt \ -r requirements/build.txt \
-r requirements-cpu.txt \ -r requirements/cpu.txt \
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
uv pip install dist/*.whl uv pip install dist/*.whl
``` ```
??? console "pip" ??? console "pip"
```bash ```bash
sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds sed -i '/^torch/d' requirements/build.txt # remove torch from requirements/build.txt since we use nightly builds
pip install -v \ pip install -v \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \ --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
-r requirements-build.txt \ -r requirements/build.txt \
-r requirements-cpu.txt \ -r requirements/cpu.txt \
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
pip install dist/*.whl pip install dist/*.whl
``` ```
......
...@@ -83,7 +83,7 @@ uv pip install dist/*.whl ...@@ -83,7 +83,7 @@ uv pip install dist/*.whl
!!! example "Troubleshooting" !!! example "Troubleshooting"
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`. - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed. - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
- `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU. - `AMD` requires at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency. - If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
```toml title="pyproject.toml" ```toml title="pyproject.toml"
[build-system] [build-system]
......
First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
Second, clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Third, install required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu python setup.py install
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu python setup.py develop
```
!!! note
If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
# --8<-- [end:extra-information]
# Google TPU
Tensor Processing Units (TPUs) are Google's custom-developed application-specific
integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
are available in different versions each with different hardware specifications.
For more information about TPUs, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm).
For more information on the TPU versions supported with vLLM, see:
- [TPU v6e](https://cloud.google.com/tpu/docs/v6e)
- [TPU v5e](https://cloud.google.com/tpu/docs/v5e)
- [TPU v5p](https://cloud.google.com/tpu/docs/v5p)
- [TPU v4](https://cloud.google.com/tpu/docs/v4)
These TPU versions allow you to configure the physical arrangements of the TPU
chips. This can improve throughput and networking performance. For more
information see:
- [TPU v6e topologies](https://cloud.google.com/tpu/docs/v6e#configurations)
- [TPU v5e topologies](https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config)
- [TPU v5p topologies](https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config)
- [TPU v4 topologies](https://cloud.google.com/tpu/docs/v4#tpu-v4-config)
In order for you to use Cloud TPUs you need to have TPU quota granted to your
Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
GPC project and are specified in terms of TPU version, the number of TPU you
want to use, and quota type. For more information, see [TPU quota](https://cloud.google.com/tpu/docs/quota#tpu_quota).
For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tpu/pricing).
You may need additional persistent storage for your TPU VMs. For more
information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options).
!!! warning
There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
## Requirements
- Google Cloud TPU VM
- TPU versions: v6e, v5e, v5p, v4
- Python: 3.11 or newer
### Provision Cloud TPUs
You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest)
or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources)
API (preferred). This section shows how to create TPUs using the queued resource API. For
more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api).
Queued resources enable you to request Cloud TPU resources in a queued manner.
When you request queued resources, the request is added to a queue maintained by
the Cloud TPU service. When the requested resource becomes available, it's
assigned to your Google Cloud project for your immediate exclusive use.
!!! note
In all of the following commands, replace the ALL CAPS parameter names with
appropriate values. See the parameter descriptions table for more information.
### Provision Cloud TPUs with GKE
For more information about using TPUs with GKE, see:
- [About TPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/tpus)
- [Deploy TPU workloads in GKE Standard](https://cloud.google.com/kubernetes-engine/docs/how-to/tpus)
- [Plan for TPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus)
## Configure a new environment
### Provision a Cloud TPU with the queued resource API
Create a TPU v5e with 4 TPU chips:
```bash
gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
--node-id TPU_NAME \
--project PROJECT_ID \
--zone ZONE \
--accelerator-type ACCELERATOR_TYPE \
--runtime-version RUNTIME_VERSION \
--service-account SERVICE_ACCOUNT
```
| Parameter name | Description |
|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request. |
| TPU_NAME | The user-assigned name of the TPU which is created when the queued resource request is allocated. |
| PROJECT_ID | Your Google Cloud project |
| ZONE | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones] |
| ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions]. |
| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). |
| SERVICE_ACCOUNT | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com` |
Connect to your TPU VM using SSH:
```bash
gcloud compute tpus tpu-vm ssh TPU_NAME --project PROJECT_ID --zone ZONE
```
!!! note
When configuring `RUNTIME_VERSION` ("TPU software version") on GCP, ensure it matches the TPU generation you've selected by referencing the [TPU VM images] compatibility matrix. Using an incompatible version may prevent vLLM from running correctly.
[TPU versions]: https://cloud.google.com/tpu/docs/runtimes
[TPU VM images]: https://cloud.google.com/tpu/docs/runtimes
[TPU regions and zones]: https://cloud.google.com/tpu/docs/regions-zones
## Set up using Python
### Pre-built wheels
Currently, there are no pre-built TPU wheels.
### Build wheel from source
Install Miniconda:
```bash
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
source ~/.bashrc
```
Create and activate a Conda environment for vLLM:
```bash
conda create -n vllm python=3.12 -y
conda activate vllm
```
Clone the vLLM repository and go to the vLLM directory:
```bash
git clone https://github.com/vllm-project/vllm.git && cd vllm
```
Uninstall the existing `torch` and `torch_xla` packages:
```bash
pip uninstall torch torch-xla -y
```
Install build dependencies:
```bash
pip install -r requirements/tpu.txt
sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev
```
Run the setup script:
```bash
VLLM_TARGET_DEVICE="tpu" python -m pip install -e .
```
## Set up using Docker
### Pre-built images
See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
### Build image from source
You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
```bash
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
```
Run the Docker image with the following command:
```bash
# Make sure to add `--privileged --net host --shm-size=16G`.
docker run --privileged --net host --shm-size=16G -it vllm-tpu
```
!!! note
Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
possible input shapes and compiles an XLA graph for each shape. The
compilation time may take 20~30 minutes in the first run. However, the
compilation time reduces to ~5 minutes afterwards because the XLA graphs are
cached in the disk (in `VLLM_XLA_CACHE_PATH` or `~/.cache/vllm/xla_cache` by default).
!!! tip
If you encounter the following error:
```console
from torch._C import * # noqa: F403
ImportError: libopenblas.so.0: cannot open shared object file: No such
file or directory
```
Install OpenBLAS with the following command:
```bash
sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev
```
...@@ -11,11 +11,11 @@ vLLM contains pre-compiled C++ and CUDA (12.8) binaries. ...@@ -11,11 +11,11 @@ vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
# --8<-- [start:set-up-using-python] # --8<-- [start:set-up-using-python]
!!! note !!! note
PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details. PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <https://github.com/vllm-project/vllm/issues/8420> for more details.
In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details. Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-wheel-from-source) for more details.
# --8<-- [end:set-up-using-python] # --8<-- [end:set-up-using-python]
# --8<-- [start:pre-built-wheels] # --8<-- [start:pre-built-wheels]
...@@ -44,8 +44,6 @@ export CUDA_VERSION=118 # or 126 ...@@ -44,8 +44,6 @@ export CUDA_VERSION=118 # or 126
uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
``` ```
[](){ #install-the-latest-code }
#### Install the latest code #### Install the latest code
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.
...@@ -128,11 +126,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll ...@@ -128,11 +126,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll
uv pip install --editable . uv pip install --editable .
``` ```
You can find more information about vLLM's wheels in [install-the-latest-code][install-the-latest-code]. You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).
!!! note !!! note
There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [install-the-latest-code][install-the-latest-code] for instructions on how to install a specified wheel. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.
#### Full build (with compilation) #### Full build (with compilation)
...@@ -250,7 +248,7 @@ uv pip install -e . ...@@ -250,7 +248,7 @@ uv pip install -e .
# --8<-- [end:build-wheel-from-source] # --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images] # --8<-- [start:pre-built-images]
See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image. See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image.
Another way to access the latest code is to use the docker images: Another way to access the latest code is to use the docker images:
...@@ -266,11 +264,11 @@ The latest code can contain bugs and may not be stable. Please use it with cauti ...@@ -266,11 +264,11 @@ The latest code can contain bugs and may not be stable. Please use it with cauti
# --8<-- [end:pre-built-images] # --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source] # --8<-- [start:build-image-from-source]
See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image. See [Building vLLM's Docker Image from Source](../../deployment/docker.md#building-vllms-docker-image-from-source) for instructions on building the Docker image.
# --8<-- [end:build-image-from-source] # --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features] # --8<-- [start:supported-features]
See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information. See [Feature x Hardware](../../features/README.md#feature-x-hardware) compatibility matrix for feature support information.
# --8<-- [end:supported-features] # --8<-- [end:supported-features]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment