Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ef9676a1
Unverified
Commit
ef9676a1
authored
Oct 14, 2025
by
Cyrus Leung
Committed by
GitHub
Oct 14, 2025
Browse files
[Doc] ruff format some Python examples (#26767)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
70b1b330
Changes
20
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
341 additions
and
290 deletions
+341
-290
docs/configuration/conserving_memory.md
docs/configuration/conserving_memory.md
+23
-21
docs/configuration/optimization.md
docs/configuration/optimization.md
+15
-9
docs/contributing/model/basic.md
docs/contributing/model/basic.md
+2
-2
docs/contributing/model/multimodal.md
docs/contributing/model/multimodal.md
+23
-19
docs/contributing/model/registration.md
docs/contributing/model/registration.md
+1
-1
docs/contributing/model/transcription.md
docs/contributing/model/transcription.md
+11
-1
docs/deployment/frameworks/cerebrium.md
docs/deployment/frameworks/cerebrium.md
+2
-2
docs/deployment/frameworks/dstack.md
docs/deployment/frameworks/dstack.md
+2
-2
docs/deployment/frameworks/haystack.md
docs/deployment/frameworks/haystack.md
+1
-1
docs/deployment/frameworks/hf_inference_endpoints.md
docs/deployment/frameworks/hf_inference_endpoints.md
+18
-18
docs/deployment/frameworks/litellm.md
docs/deployment/frameworks/litellm.md
+7
-6
docs/deployment/frameworks/retrieval_augmented_generation.md
docs/deployment/frameworks/retrieval_augmented_generation.md
+2
-2
docs/design/cuda_graphs.md
docs/design/cuda_graphs.md
+9
-7
docs/design/io_processor_plugins.md
docs/design/io_processor_plugins.md
+15
-12
docs/design/metrics.md
docs/design/metrics.md
+7
-5
docs/design/prefix_caching.md
docs/design/prefix_caching.md
+2
-2
docs/features/lora.md
docs/features/lora.md
+4
-7
docs/features/multimodal_inputs.md
docs/features/multimodal_inputs.md
+147
-130
docs/features/reasoning_outputs.md
docs/features/reasoning_outputs.md
+31
-25
docs/features/tool_calling.md
docs/features/tool_calling.md
+19
-18
No files found.
docs/configuration/conserving_memory.md
View file @
ef9676a1
...
...
@@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
```
python
from
vllm
import
LLM
llm
=
LLM
(
model
=
"ibm-granite/granite-3.1-8b-instruct"
,
tensor_parallel_size
=
2
)
llm
=
LLM
(
model
=
"ibm-granite/granite-3.1-8b-instruct"
,
tensor_parallel_size
=
2
)
```
!!! warning
...
...
@@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
```
python
from
vllm
import
LLM
llm
=
LLM
(
model
=
"adept/fuyu-8b"
,
max_model_len
=
2048
,
max_num_seqs
=
2
)
llm
=
LLM
(
model
=
"adept/fuyu-8b"
,
max_model_len
=
2048
,
max_num_seqs
=
2
)
```
## Reduce CUDA Graphs
...
...
@@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```
python
from
vllm
import
LLM
llm
=
LLM
(
model
=
"meta-llama/Llama-3.1-8B-Instruct"
,
enforce_eager
=
True
)
llm
=
LLM
(
model
=
"meta-llama/Llama-3.1-8B-Instruct"
,
enforce_eager
=
True
)
```
## Adjust cache size
...
...
@@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from
vllm
import
LLM
# Accept up to 3 images and 1 video per prompt
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
limit_mm_per_prompt
=
{
"image"
:
3
,
"video"
:
1
})
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
limit_mm_per_prompt
=
{
"image"
:
3
,
"video"
:
1
},
)
```
You can go a step further and disable unused modalities completely by setting their limits to zero.
...
...
@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
from
vllm
import
LLM
# Accept any number of images but no videos
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
limit_mm_per_prompt
=
{
"video"
:
0
})
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
limit_mm_per_prompt
=
{
"video"
:
0
},
)
```
You can even run a multi-modal model for text-only inference:
...
...
@@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference:
from
vllm
import
LLM
# Don't accept images. Just text.
llm
=
LLM
(
model
=
"google/gemma-3-27b-it"
,
limit_mm_per_prompt
=
{
"image"
:
0
})
llm
=
LLM
(
model
=
"google/gemma-3-27b-it"
,
limit_mm_per_prompt
=
{
"image"
:
0
},
)
```
### Configurable options
...
...
@@ -173,14 +175,14 @@ Here are some examples:
from
vllm
import
LLM
# Available for Qwen2-VL series models
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
mm_processor_kwargs
=
{
"max_pixels"
:
768
*
768
,
# Default is 1280 * 28 * 28
}
)
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
mm_processor_kwargs
=
{
"max_pixels"
:
768
*
768
}
,
# Default is 1280 * 28 * 28
)
# Available for InternVL series models
llm
=
LLM
(
model
=
"OpenGVLab/InternVL2-2B"
,
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
4
,
# Default is 12
}
)
llm
=
LLM
(
model
=
"OpenGVLab/InternVL2-2B"
,
mm_processor_kwargs
=
{
"max_dynamic_patch"
:
4
}
,
# Default is 12
)
```
docs/configuration/optimization.md
View file @
ef9676a1
...
...
@@ -100,7 +100,7 @@ from vllm import LLM
llm
=
LLM
(
model
=
"meta-llama/Llama-3.3-70B-Instruct,
tensor_parallel_size=4,
pipeline_parallel_size=2
pipeline_parallel_size=2
,
)
```
...
...
@@ -257,18 +257,24 @@ Examples:
```
python
# Use a larger cache
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
mm_processor_cache_gb
=
8
)
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
mm_processor_cache_gb
=
8
,
)
# Use a shared-memory based IPC cache
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
tensor_parallel_size
=
2
,
mm_processor_cache_type
=
"shm"
,
mm_processor_cache_gb
=
8
)
mm_processor_cache_gb
=
8
,
)
# Disable the cache
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
mm_processor_cache_gb
=
0
)
llm
=
LLM
(
model
=
"Qwen/Qwen2.5-VL-3B-Instruct"
,
mm_processor_cache_gb
=
0
,
)
```
### Cache Placement
...
...
docs/contributing/model/basic.md
View file @
ef9676a1
...
...
@@ -73,8 +73,8 @@ def forward(
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
:
...
```
...
...
docs/contributing/model/multimodal.md
View file @
ef9676a1
...
...
@@ -16,7 +16,7 @@ Further update the model as follows:
...
@classmethod
def get_placeholder_str(cls, modality: str, i: int) ->
Optional[str]
:
def get_placeholder_str(cls, modality: str, i: int) ->
str | None
:
if modality.startswith("image"):
return "<image>"
...
...
@@ -45,14 +45,14 @@ Further update the model as follows:
...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
self,
**kwargs: object,
) -> MultiModalEmbeddings | None:
# Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
...
...
@@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
For example, if the model supports any number of images but only one video per prompt:
```
python
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
Optional
[
int
]
]:
def
get_supported_mm_limits
(
self
)
->
Mapping
[
str
,
int
|
None
]:
return
{
"image"
:
None
,
"video"
:
1
}
```
...
...
@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self,
seq_len: int,
mm_counts: Mapping[str, int],
mm_options:
Optional[
Mapping[str, BaseDummyOptions]
]
= None,
mm_options: Mapping[str, BaseDummyOptions]
| None
= None,
) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0)
...
...
@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python
def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor()
return ImageSize(width=image_processor.size["width"],
height=image_processor.size["height"])
return ImageSize(
width=image_processor.size["width"],
height=image_processor.size["height"],
)
```
Fuyu does not expect image placeholders in the inputs to HF processor, so
...
...
@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
return {
"image":
self._get_dummy_images(width=target_width,
self._get_dummy_images(
width=target_width,
height=target_height,
num_images=num_images,
overrides=image_overrides)
overrides=image_overrides,
)
}
```
...
...
@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
...
...
@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id],
...
...
@@ -810,9 +812,11 @@ to register them to the multi-modal registry:
from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
+ @MULTIMODAL_REGISTRY.register_processor(
+ YourMultiModalProcessor,
+ info=YourProcessingInfo,
+ dummy_inputs=YourDummyInputsBuilder)
+ dummy_inputs=YourDummyInputsBuilder,
+ )
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
```
...
...
docs/contributing/model/registration.md
View file @
ef9676a1
...
...
@@ -42,7 +42,7 @@ def register():
ModelRegistry
.
register_model
(
"YourModelForCausalLM"
,
"your_code:YourModelForCausalLM"
"your_code:YourModelForCausalLM"
,
)
```
...
...
docs/contributing/model/transcription.md
View file @
ef9676a1
...
...
@@ -15,6 +15,7 @@ Declare supported languages and capabilities:
-
Set
`supports_transcription_only=True`
if the model should not serve text generation (e.g., Whisper).
??? code "supported_languages and supports_transcription_only"
```python
from typing import ClassVar, Mapping, Literal
import numpy as np
...
...
@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model:
??? code "get_speech_to_text_config()"
```python
class YourASRModel(nn.Module, SupportsTranscription):
...
...
...
@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing
`multi_modal_data`
with the audio, and either a
`prompt`
string or
`prompt_token_ids`
:
??? code "get_generation_prompt()"
```python
class YourASRModel(nn.Module, SupportsTranscription):
...
...
...
@@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
Return a dict with separate
`encoder_prompt`
and
`decoder_prompt`
entries:
??? code "get_generation_prompt()"
```python
class YourASRModel(nn.Module, SupportsTranscription):
...
...
...
@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper):
??? code "validate_language()"
```python
@classmethod
def validate_language(cls, language: str | None) -> str | None:
if language is None:
logger.warning(
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field."
)
"Defaulting to language='en'. If you wish to transcribe "
"audio in a different language, pass the `language` field "
"in the TranscriptionRequest."
)
language = "en"
return super().validate_language(language)
```
...
...
@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics:
??? code "get_num_audio_tokens()"
```python
class YourASRModel(nn.Module, SupportsTranscription):
...
...
...
@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic:
??? code "_preprocess_speech_to_text()"
```python
# vllm/entrypoints/openai/speech_to_text.py
async def _preprocess_speech_to_text(...):
...
...
docs/deployment/frameworks/cerebrium.md
View file @
ef9676a1
...
...
@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
??? console "Command"
```
python
```
bash
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \
-H 'Authorization: <JWT TOKEN>' \
...
...
@@ -81,7 +81,7 @@ You should get a response like:
??? console "Response"
```
pyth
on
```
js
on
{
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
"result": {
...
...
docs/deployment/frameworks/dstack.md
View file @
ef9676a1
...
...
@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
client = OpenAI(
base_url="https://gateway.<gateway domain>",
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>"
,
)
completion = client.chat.completions.create(
...
...
@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
"role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.",
}
]
]
,
)
print(completion.choices[0].message.content)
...
...
docs/deployment/frameworks/haystack.md
View file @
ef9676a1
...
...
@@ -34,7 +34,7 @@ pip install vllm haystack-ai
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs
=
{"max_tokens": 512}
generation_kwargs
=
{"max_tokens": 512}
,
)
response = generator.run(
...
...
docs/deployment/frameworks/hf_inference_endpoints.md
View file @
ef9676a1
...
...
@@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
import os
client = OpenAI(
base_url
=
DEPLOYMENT_URL,
api_key
=
os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
base_url
=
DEPLOYMENT_URL,
api_key
=
os.environ["HF_TOKEN"]
,
# https://huggingface.co/settings/tokens
)
chat_completion = client.chat.completions.create(
model
=
"HuggingFaceTB/SmolLM3-3B",
messages
=
[
model
=
"HuggingFaceTB/SmolLM3-3B",
messages
=
[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Give me a brief explanation of gravity in simple terms."
"text": "Give me a brief explanation of gravity in simple terms."
,
}
]
]
,
}
],
stream
=
True
stream
=
True
,
)
for message in chat_completion:
print(message.choices[0].delta.content, end
=
"")
print(message.choices[0].delta.content, end
=
"")
```
!!! note
...
...
@@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
import os
client = OpenAI(
base_url
=
DEPLOYMENT_URL,
api_key
=
os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens
base_url
=
DEPLOYMENT_URL,
api_key
=
os.environ["HF_TOKEN"]
,
# https://huggingface.co/settings/tokens
)
chat_completion = client.chat.completions.create(
model
=
"ibm-granite/granite-docling-258M",
messages
=
[
model
=
"ibm-granite/granite-docling-258M",
messages
=
[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
}
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png"
,
}
,
},
{
"type": "text",
"text": "Convert this page to docling."
}
"text": "Convert this page to docling."
,
}
,
]
}
],
stream
=
True
stream
=
True
,
)
for message in chat_completion:
print(message.choices[0].delta.content, end
=
"")
print(message.choices[0].delta.content, end
=
"")
```
!!! note
...
...
docs/deployment/frameworks/litellm.md
View file @
ef9676a1
...
...
@@ -36,7 +36,7 @@ pip install vllm litellm
```python
import litellm
messages = [{
"content": "Hello, how are you?","role": "user"}]
messages = [{"content": "Hello, how are you?",
"role": "user"}]
# hosted_vllm is prefix key word and necessary
response = litellm.completion(
...
...
@@ -44,7 +44,8 @@ pip install vllm litellm
messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2,
max_tokens=80)
max_tokens=80,
)
print(response)
```
...
...
docs/deployment/frameworks/retrieval_augmented_generation.md
View file @
ef9676a1
...
...
@@ -40,7 +40,7 @@ pip install -U vllm \
1.
Run the script
```
python
```
bash
python retrieval_augmented_generation_with_langchain.py
```
...
...
@@ -78,6 +78,6 @@ pip install vllm \
1.
Run the script:
```
python
```
bash
python retrieval_augmented_generation_with_llamaindex.py
```
docs/design/cuda_graphs.md
View file @
ef9676a1
...
...
@@ -106,9 +106,11 @@ The dispatch code looks like:
batch_descriptor
=
BatchDescriptor
(
num_tokens
=
num_input_tokens
,
uniform_decode
=
...)
runtime_mode
,
batch_descriptor
=
cudagraphdispatcher
.
dispatch
(
batch_descriptor
)
# execution
with
set_forward_context
(...,
with
set_forward_context
(
...,
cudagraph_runtime_mode
=
runtime_mode
,
batch_descriptor
=
batch_descriptor
):
batch_descriptor
=
batch_descriptor
,
):
output
=
self
.
model
(...)
```
...
...
@@ -203,9 +205,9 @@ from vllm.config import CUDAGraphMode
compilation_config
=
{
"level"
:
3
,
"cudagraph_mode"
:
"FULL_AND_PIECEWISE"
}
model
=
vllm
.
LLM
(
model
=
"meta-llama/Llama-3.1-8B-Instruct"
,
dtype
=
'
auto
'
,
compilation_config
=
compilation_config
,
)
dtype
=
"
auto
"
,
compilation_config
=
compilation_config
,
)
sampling_params
=
vllm
.
SamplingParams
(
temperature
=
0
,
# greedy decoding
max_tokens
=
1024
,
...
...
docs/design/io_processor_plugins.md
View file @
ef9676a1
...
...
@@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin
IO Processor plugins implement the
`IOProcessor`
interface (
<gh-file:vllm
/
plugins
/
io_processors
/
interface.py
>
):
```
python
IOProcessorInput
=
TypeVar
(
'
IOProcessorInput
'
)
IOProcessorOutput
=
TypeVar
(
'
IOProcessorOutput
'
)
IOProcessorInput
=
TypeVar
(
"
IOProcessorInput
"
)
IOProcessorOutput
=
TypeVar
(
"
IOProcessorOutput
"
)
class
IOProcessor
(
ABC
,
Generic
[
IOProcessorInput
,
IOProcessorOutput
]):
...
...
@@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
def
pre_process
(
self
,
prompt
:
IOProcessorInput
,
request_id
:
Optional
[
str
]
=
None
,
request_id
:
str
|
None
=
None
,
**
kwargs
,
)
->
Union
[
PromptType
,
Sequence
[
PromptType
]
]
:
)
->
PromptType
|
Sequence
[
PromptType
]:
raise
NotImplementedError
async
def
pre_process_async
(
self
,
prompt
:
IOProcessorInput
,
request_id
:
Optional
[
str
]
=
None
,
request_id
:
str
|
None
=
None
,
**
kwargs
,
)
->
Union
[
PromptType
,
Sequence
[
PromptType
]
]
:
)
->
PromptType
|
Sequence
[
PromptType
]:
return
self
.
pre_process
(
prompt
,
request_id
,
**
kwargs
)
@
abstractmethod
def
post_process
(
self
,
def
post_process
(
self
,
model_output
:
Sequence
[
PoolingRequestOutput
],
request_id
:
Optional
[
str
]
=
None
,
**
kwargs
)
->
IOProcessorOutput
:
request_id
:
str
|
None
=
None
,
**
kwargs
,
)
->
IOProcessorOutput
:
raise
NotImplementedError
async
def
post_process_async
(
self
,
model_output
:
AsyncGenerator
[
tuple
[
int
,
PoolingRequestOutput
]],
request_id
:
Optional
[
str
]
=
None
,
request_id
:
str
|
None
=
None
,
**
kwargs
,
)
->
IOProcessorOutput
:
collected_output
=
[
item
async
for
i
,
item
in
model_output
]
...
...
@@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@
abstractmethod
def
output_to_response
(
self
,
plugin_output
:
IOProcessorOutput
)
->
IOProcessorResponse
:
self
,
plugin_output
:
IOProcessorOutput
)
->
IOProcessorResponse
:
raise
NotImplementedError
```
...
...
docs/design/metrics.md
View file @
ef9676a1
...
...
@@ -478,15 +478,17 @@ us with:
```python
if seq_group.is_finished():
if (seq_group.metrics.first_scheduled_time is not None and
seq_group.metrics.first_token_time is not None):
if (
seq_group.metrics.first_scheduled_time is not None
and seq_group.metrics.first_token_time is not None
):
time_queue_requests.append(
seq_group.metrics.first_scheduled_time -
seq_group.metrics.arrival_time)
seq_group.metrics.arrival_time
)
...
if seq_group.metrics.time_in_queue is not None:
time_in_queue_requests.append(
seq_group.metrics.time_in_queue)
time_in_queue_requests.append(seq_group.metrics.time_in_queue)
```
This seems duplicative, and one of them should be removed. The latter
...
...
docs/design/prefix_caching.md
View file @
ef9676a1
...
...
@@ -112,8 +112,8 @@ class KVCacheBlock:
ref_cnt
:
int
# The pointers to form a doubly linked list for the free queue.
prev_free_block
:
Optional
[
"KVCacheBlock"
]
=
None
next_free_block
:
Optional
[
"KVCacheBlock"
]
=
None
prev_free_block
:
"KVCacheBlock
| None
"
=
None
next_free_block
:
"KVCacheBlock
| None
"
=
None
```
There are two design points to highlight:
...
...
docs/features/lora.md
View file @
ef9676a1
...
...
@@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
sampling_params = SamplingParams(
temperature=0,
max_tokens=256,
stop=["[/assistant]"]
stop=["[/assistant]"]
,
)
prompts = [
...
...
@@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter.
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
,
)
```
...
...
@@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
lora_request = LoRARequest(
lora_name=lora_name,
lora_path=local_path,
lora_int_id=abs(hash(lora_name))
lora_int_id=abs(hash(lora_name))
,
)
return lora_request
```
...
...
@@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
if has_audio:
question = f"<|audio|>{question}"
chat = [
{
"role": "user",
"content": question
}
{"role": "user", "content": question},
]
return tokenizer.apply_chat_template(chat, tokenize=False)
...
...
docs/features/multimodal_inputs.md
View file @
ef9676a1
...
...
@@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": {
"image": [image1, image2]
},
"multi_modal_data": {"image": [image1, image2]},
})
for o in outputs:
...
...
@@ -183,21 +181,24 @@ conversation = [
{
"role"
:
"assistant"
,
"content"
:
"Hello! How can I assist you today?"
},
{
"role"
:
"user"
,
"content"
:
[{
"content"
:
[
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
},{
"image_url"
:
{
"url"
:
image_url
},
},
{
"type"
:
"image_pil"
,
"image_pil"
:
image_pil
},
{
"image_pil"
:
image_pil
,
},
{
"type"
:
"image_embeds"
,
"image_embeds"
:
image_embeds
},
{
"image_embeds"
:
image_embeds
,
},
{
"type"
:
"text"
,
"text"
:
"What's in these images?"
}],
"text"
:
"What's in these images?"
,
},
],
},
]
...
...
@@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with
message = {
"role": "user",
"content": [
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
{
"type": "text",
"text": "Describe this set of frames. Consider the frames to be a part of the same video.",
},
],
}
for i in range(len(video_frames)):
...
...
@@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
# Custom black background for dark theme
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
,
)
# Custom brand color background (e.g., blue)
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
,
)
```
...
...
@@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
limit_mm_per_prompt={"video": 1},
)
sampling_params = SamplingParams(
max_tokens=1024,
)
sampling_params = SamplingParams(max_tokens=1024)
video_messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"role": "system",
"content": "You are a helpful assistant.",
},
{
"role": "user",
"content": [
{"type": "text", "text": "describe this video."},
{
"type": "video",
"video": video_path,
"total_pixels": 20480 * 28 * 28,
"min_pixels": 16 * 28 * 28
}
"min_pixels": 16 * 28 * 28
,
}
,
]
},
]
...
...
@@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct",
messages=[{
messages=[
{
"role": "user",
"content": [
# NOTE: The prompt formatting with the image token `<image>` is not needed
# since the prompt will be processed automatically by the API server.
{"type": "text", "text": "What’s in this image?"},
{
"type": "image_url",
"image_url": {
url": image_url
"type": "text",
"text": "What’s in this image?",
},
"uuid": image_url # Optional
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_url, # Optional
},
],
}],
}
],
)
print("Chat completion output:", chat_response.choices[0].message.content)
...
...
@@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct",
messages=[{
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What are the animals in these images?"},
{
"type": "image_url",
"image_url": {
"url": image_url_duck
},
"uuid": image_url_duck # Optional
"type": "text",
"text": "What are the animals in these images?",
},
{
"type": "image_url",
"image_url": {
"u
rl
": image_url_
l
ion
"image_url": {
"url": image_url_duck},
"u
uid
": image_url_
duck, # Opt
ion
al
},
"uuid": image_url_lion # Optional
{
"type": "image_url",
"image_url": {"url": image_url_lion},
"uuid": image_url_lion, # Optional
},
],
}],
}
],
)
print("Chat completion output:", chat_response.choices[0].message.content)
```
...
...
@@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows:
## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[
{
"role":
"user",
messages=[
{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this video?"
"text": "What's in this video?"
,
},
{
"type": "video_url",
"video_url": {
"url": video_url
},
"uuid": video_url # Optional
"video_url": {"url": video_url},
"uuid": video_url, # Optional
},
],
}],
}
],
model=model,
max_completion_tokens=64,
)
...
...
@@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows:
audio_base64 = encode_base64_content_from_url(audio_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
"text": "What's in this audio?"
,
},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64,
"format": "wav"
"format": "wav"
,
},
"uuid": audio_url # Optional
"uuid": audio_url, # Optional
},
],
},
],
}],
model=model,
max_completion_tokens=64,
)
...
...
@@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
```python
chat_completion_from_url = client.chat.completions.create(
messages=[{
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
"text": "What's in this audio?"
,
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
"uuid": audio_url # Optional
"audio_url": {"url": audio_url},
"uuid": audio_url, # Optional
},
],
}],
}
],
model=model,
max_completion_tokens=64,
)
...
...
@@ -750,7 +762,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
embeds = {
"type": "image_embeds",
"image_embeds": f"{base64_image_embedding}",
"uuid": image_url # Optional
"uuid": image_url
,
# Optional
}
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
...
...
@@ -758,24 +770,29 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
embeds = {
"type": "image_embeds",
"image_embeds": {
"image_embeds": f"{base64_image_embedding}"
, # Required
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
"image_embeds": f"{base64_image_embedding}",
# Required
"image_grid_thw": f"{base64_image_grid_thw}"
,
# Required by Qwen/Qwen2-VL-2B-Instruct
},
"uuid": image_url # Optional
"uuid": image_url
,
# Optional
}
model = "openbmb/MiniCPM-V-2_6"
embeds = {
"type": "image_embeds",
"image_embeds": {
"image_embeds": f"{base64_image_embedding}"
, # Required
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
"image_embeds": f"{base64_image_embedding}",
# Required
"image_sizes": f"{base64_image_sizes}"
,
# Required by openbmb/MiniCPM-V-2_6
},
"uuid": image_url # Optional
"uuid": image_url
,
# Optional
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"role": "system",
"content": "You are a helpful assistant.",
},
{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this image?",
...
...
@@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
{
"type": "image_embeds",
"image_embeds": None,
"uuid": image_uuid
"uuid": image_uuid
,
},
# input_audio:
{
"type": "input_audio",
"input_audio": None,
"uuid": audio_uuid
"uuid": audio_uuid
,
},
# PIL Image:
{
"type": "image_pil",
"image_pil": None
"uuid": image_uuid
}
"image_pil": None
,
"uuid": image_uuid
,
}
,
```
...
...
docs/features/reasoning_outputs.md
View file @
ef9676a1
...
...
@@ -117,9 +117,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model,
stream = client.chat.completions.create(
model=model,
messages=messages,
stream=True)
stream=True,
)
print("client: Start streaming chat completions...")
printed_reasoning_content = False
...
...
@@ -159,7 +161,8 @@ The reasoning content is also available when both tool calling and the reasoning
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
tools = [{
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
...
...
@@ -168,18 +171,19 @@ The reasoning content is also available when both tool calling and the reasoning
"type": "object",
"properties": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
,
},
"required": ["location", "unit"]
"required": ["location", "unit"]
,
}
},
}
}
]
]
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools,
tool_choice="auto"
tool_choice="auto"
,
)
print(response)
...
...
@@ -225,7 +229,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) ->
Union[
DeltaMessage
,
None
]
:
) -> DeltaMessage
|
None:
"""
Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and
...
...
@@ -235,8 +239,10 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
"""
def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[Optional[str], Optional[str]]:
self,
model_output: str,
request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
"""
Extract reasoning content from a complete model-generated string.
...
...
@@ -274,10 +280,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
@classmethod
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
return cls(
start_token_id=tokenizer.encode(
"<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>",
add_special_tokens=False)[0]
)
return cls(
start_token_id=tokenizer.encode(
"<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>",
add_special_tokens=False)[0],
)
def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids
...
...
docs/features/tool_calling.md
View file @
ef9676a1
...
...
@@ -27,7 +27,8 @@ Next, make a request that triggers the model to use the available tools:
return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather}
tools = [{
tools = [
{
"type": "function",
"function": {
"name": "get_weather",
...
...
@@ -38,16 +39,17 @@ Next, make a request that triggers the model to use the available tools:
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
}
}
}]
"required": ["location", "unit"],
},
},
},
]
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools,
tool_choice="auto"
tool_choice="auto"
,
)
tool_call = response.choices[0].message.tool_calls[0].function
...
...
@@ -402,8 +404,7 @@ Here is a summary of a plugin file:
# adjust request. e.g.: set skip special tokens
# to False for tool call output.
def adjust_request(
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
return request
# implement the tool call parse for stream call
...
...
@@ -416,7 +417,7 @@ Here is a summary of a plugin file:
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
request: ChatCompletionRequest,
) ->
Union[
DeltaMessage
,
None
]
:
) -> DeltaMessage
|
None:
return delta
# implement the tool parse for non-stream call
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment