Unverified commit ef9676a1, authored by Cyrus Leung, committed by GitHub
Browse files

[Doc] ruff format some Python examples (#26767)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 70b1b330
...@@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs. ...@@ -11,8 +11,7 @@ The following code splits the model across 2 GPUs.
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
tensor_parallel_size=2)
``` ```
!!! warning !!! warning
...@@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option). ...@@ -43,9 +42,7 @@ and the maximum batch size (`max_num_seqs` option).
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="adept/fuyu-8b", llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
max_model_len=2048,
max_num_seqs=2)
``` ```
## Reduce CUDA Graphs ## Reduce CUDA Graphs
...@@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag: ...@@ -78,8 +75,7 @@ You can disable graph capturing completely via the `enforce_eager` flag:
```python ```python
from vllm import LLM from vllm import LLM
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enforce_eager=True)
enforce_eager=True)
``` ```
## Adjust cache size ## Adjust cache size
...@@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem ...@@ -97,8 +93,10 @@ You can allow a smaller number of multi-modal items per prompt to reduce the mem
from vllm import LLM from vllm import LLM
# Accept up to 3 images and 1 video per prompt # Accept up to 3 images and 1 video per prompt
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(
limit_mm_per_prompt={"image": 3, "video": 1}) model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"image": 3, "video": 1},
)
``` ```
You can go a step further and disable unused modalities completely by setting their limits to zero. You can go a step further and disable unused modalities completely by setting their limits to zero.
...@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a ...@@ -108,8 +106,10 @@ For example, if your application only accepts image input, there is no need to a
from vllm import LLM from vllm import LLM
# Accept any number of images but no videos # Accept any number of images but no videos
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(
limit_mm_per_prompt={"video": 0}) model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"video": 0},
)
``` ```
You can even run a multi-modal model for text-only inference: You can even run a multi-modal model for text-only inference:
...@@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference: ...@@ -118,8 +118,10 @@ You can even run a multi-modal model for text-only inference:
from vllm import LLM from vllm import LLM
# Don't accept images. Just text. # Don't accept images. Just text.
llm = LLM(model="google/gemma-3-27b-it", llm = LLM(
limit_mm_per_prompt={"image": 0}) model="google/gemma-3-27b-it",
limit_mm_per_prompt={"image": 0},
)
``` ```
### Configurable options ### Configurable options
...@@ -173,14 +175,14 @@ Here are some examples: ...@@ -173,14 +175,14 @@ Here are some examples:
from vllm import LLM from vllm import LLM
# Available for Qwen2-VL series models # Available for Qwen2-VL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(
mm_processor_kwargs={ model="Qwen/Qwen2.5-VL-3B-Instruct",
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28 mm_processor_kwargs={"max_pixels": 768 * 768}, # Default is 1280 * 28 * 28
}) )
# Available for InternVL series models # Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B", llm = LLM(
mm_processor_kwargs={ model="OpenGVLab/InternVL2-2B",
"max_dynamic_patch": 4, # Default is 12 mm_processor_kwargs={"max_dynamic_patch": 4}, # Default is 12
}) )
``` ```
...@@ -100,7 +100,7 @@ from vllm import LLM ...@@ -100,7 +100,7 @@ from vllm import LLM
llm = LLM( llm = LLM(
model="meta-llama/Llama-3.3-70B-Instruct", model="meta-llama/Llama-3.3-70B-Instruct",
tensor_parallel_size=4, tensor_parallel_size=4,
pipeline_parallel_size=2 pipeline_parallel_size=2,
) )
``` ```
...@@ -257,18 +257,24 @@ Examples: ...@@ -257,18 +257,24 @@ Examples:
```python ```python
# Use a larger cache # Use a larger cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(
mm_processor_cache_gb=8) model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_cache_gb=8,
)
# Use a shared-memory based IPC cache # Use a shared-memory based IPC cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(
model="Qwen/Qwen2.5-VL-3B-Instruct",
tensor_parallel_size=2, tensor_parallel_size=2,
mm_processor_cache_type="shm", mm_processor_cache_type="shm",
mm_processor_cache_gb=8) mm_processor_cache_gb=8,
)
# Disable the cache # Disable the cache
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(
mm_processor_cache_gb=0) model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_cache_gb=0,
)
``` ```
### Cache Placement ### Cache Placement
......
...@@ -73,8 +73,8 @@ def forward( ...@@ -73,8 +73,8 @@ def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: Optional[torch.Tensor] = None, inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor: ) -> torch.Tensor:
... ...
``` ```
......
...@@ -16,7 +16,7 @@ Further update the model as follows: ...@@ -16,7 +16,7 @@ Further update the model as follows:
... ...
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"): if modality.startswith("image"):
return "<image>" return "<image>"
...@@ -45,14 +45,14 @@ Further update the model as follows: ...@@ -45,14 +45,14 @@ Further update the model as follows:
... ...
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
assert self.vision_encoder is not None assert self.vision_encoder is not None
image_features = self.vision_encoder(image_input) image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def get_multimodal_embeddings( def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: self,
**kwargs: object,
) -> MultiModalEmbeddings | None:
# Validate the multimodal input keyword arguments # Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
...@@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m ...@@ -110,7 +110,7 @@ to return the maximum number of input items for each modality supported by the m
For example, if the model supports any number of images but only one video per prompt: For example, if the model supports any number of images but only one video per prompt:
```python ```python
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"image": None, "video": 1} return {"image": None, "video": 1}
``` ```
...@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ...@@ -258,7 +258,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, mm_options: Mapping[str, BaseDummyOptions] | None = None,
) -> MultiModalDataDict: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
...@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ...@@ -421,8 +421,10 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
```python ```python
def get_image_size_with_most_features(self) -> ImageSize: def get_image_size_with_most_features(self) -> ImageSize:
image_processor = self.get_image_processor() image_processor = self.get_image_processor()
return ImageSize(width=image_processor.size["width"], return ImageSize(
height=image_processor.size["height"]) width=image_processor.size["width"],
height=image_processor.size["height"],
)
``` ```
Fuyu does not expect image placeholders in the inputs to HF processor, so Fuyu does not expect image placeholders in the inputs to HF processor, so
...@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in ...@@ -452,10 +454,12 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
return { return {
"image": "image":
self._get_dummy_images(width=target_width, self._get_dummy_images(
width=target_width,
height=target_height, height=target_height,
num_images=num_images, num_images=num_images,
overrides=image_overrides) overrides=image_overrides,
)
} }
``` ```
...@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies ...@@ -744,8 +748,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
...@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies ...@@ -781,8 +784,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
image_width=image_size.width, image_width=image_size.width,
image_height=image_size.height, image_height=image_size.height,
) )
image_tokens = ([_IMAGE_TOKEN_ID] * ncols + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
[_NEWLINE_TOKEN_ID]) * nrows
return PromptUpdateDetails.select_token_id( return PromptUpdateDetails.select_token_id(
image_tokens + [bos_token_id], image_tokens + [bos_token_id],
...@@ -810,9 +812,11 @@ to register them to the multi-modal registry: ...@@ -810,9 +812,11 @@ to register them to the multi-modal registry:
from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.multimodal import MULTIMODAL_REGISTRY + from vllm.multimodal import MULTIMODAL_REGISTRY
+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor, + @MULTIMODAL_REGISTRY.register_processor(
+ YourMultiModalProcessor,
+ info=YourProcessingInfo, + info=YourProcessingInfo,
+ dummy_inputs=YourDummyInputsBuilder) + dummy_inputs=YourDummyInputsBuilder,
+ )
class YourModelForImage2Seq(nn.Module, SupportsMultiModal): class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
``` ```
......
...@@ -42,7 +42,7 @@ def register(): ...@@ -42,7 +42,7 @@ def register():
ModelRegistry.register_model( ModelRegistry.register_model(
"YourModelForCausalLM", "YourModelForCausalLM",
"your_code:YourModelForCausalLM" "your_code:YourModelForCausalLM",
) )
``` ```
......
...@@ -15,6 +15,7 @@ Declare supported languages and capabilities: ...@@ -15,6 +15,7 @@ Declare supported languages and capabilities:
- Set `supports_transcription_only=True` if the model should not serve text generation (e.g., Whisper). - Set `supports_transcription_only=True` if the model should not serve text generation (e.g., Whisper).
??? code "supported_languages and supports_transcription_only" ??? code "supported_languages and supports_transcription_only"
```python ```python
from typing import ClassVar, Mapping, Literal from typing import ClassVar, Mapping, Literal
import numpy as np import numpy as np
...@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor ...@@ -43,6 +44,7 @@ Provide an ASR configuration via [get_speech_to_text_config][vllm.model_executor
This is for controlling general behavior of the API when serving your model: This is for controlling general behavior of the API when serving your model:
??? code "get_speech_to_text_config()" ??? code "get_speech_to_text_config()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
...@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo ...@@ -71,6 +73,7 @@ Implement the prompt construction via [get_generation_prompt][vllm.model_executo
Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`: Return a dict containing `multi_modal_data` with the audio, and either a `prompt` string or `prompt_token_ids`:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
...@@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt ...@@ -107,6 +110,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
??? code "get_generation_prompt()" ??? code "get_generation_prompt()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
...@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface ...@@ -148,12 +152,16 @@ Language validation via [validate_language][vllm.model_executor.models.interface
If your model requires a language and you want a default, override this method (see Whisper): If your model requires a language and you want a default, override this method (see Whisper):
??? code "validate_language()" ??? code "validate_language()"
```python ```python
@classmethod @classmethod
def validate_language(cls, language: str | None) -> str | None: def validate_language(cls, language: str | None) -> str | None:
if language is None: if language is None:
logger.warning( logger.warning(
"Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") "Defaulting to language='en'. If you wish to transcribe "
"audio in a different language, pass the `language` field "
"in the TranscriptionRequest."
)
language = "en" language = "en"
return super().validate_language(language) return super().validate_language(language)
``` ```
...@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo ...@@ -165,6 +173,7 @@ Token accounting for streaming via [get_num_audio_tokens][vllm.model_executor.mo
Provide a fast duration→token estimate to improve streaming usage statistics: Provide a fast duration→token estimate to improve streaming usage statistics:
??? code "get_num_audio_tokens()" ??? code "get_num_audio_tokens()"
```python ```python
class YourASRModel(nn.Module, SupportsTranscription): class YourASRModel(nn.Module, SupportsTranscription):
... ...
...@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi ...@@ -191,6 +200,7 @@ The API server takes care of basic audio I/O and optional chunking before buildi
Relevant server logic: Relevant server logic:
??? code "_preprocess_speech_to_text()" ??? code "_preprocess_speech_to_text()"
```python ```python
# vllm/entrypoints/openai/speech_to_text.py # vllm/entrypoints/openai/speech_to_text.py
async def _preprocess_speech_to_text(...): async def _preprocess_speech_to_text(...):
......
...@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference ...@@ -63,7 +63,7 @@ If successful, you should be returned a CURL command that you can call inference
??? console "Command" ??? console "Command"
```python ```bash
curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
-H 'Authorization: <JWT TOKEN>' \ -H 'Authorization: <JWT TOKEN>' \
...@@ -81,7 +81,7 @@ You should get a response like: ...@@ -81,7 +81,7 @@ You should get a response like:
??? console "Response" ??? console "Response"
```python ```json
{ {
"run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262",
"result": { "result": {
......
...@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK: ...@@ -83,7 +83,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
client = OpenAI( client = OpenAI(
base_url="https://gateway.<gateway domain>", base_url="https://gateway.<gateway domain>",
api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>" api_key="<YOUR-DSTACK-SERVER-ACCESS-TOKEN>",
) )
completion = client.chat.completions.create( completion = client.chat.completions.create(
...@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK: ...@@ -93,7 +93,7 @@ After the provisioning, you can interact with the model by using the OpenAI SDK:
"role": "user", "role": "user",
"content": "Compose a poem that explains the concept of recursion in programming.", "content": "Compose a poem that explains the concept of recursion in programming.",
} }
] ],
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
......
...@@ -34,7 +34,7 @@ pip install vllm haystack-ai ...@@ -34,7 +34,7 @@ pip install vllm haystack-ai
api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"), api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
model="mistralai/Mistral-7B-Instruct-v0.1", model="mistralai/Mistral-7B-Instruct-v0.1",
api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1", api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
generation_kwargs = {"max_tokens": 512} generation_kwargs={"max_tokens": 512},
) )
response = generator.run( response = generator.run(
......
...@@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo ...@@ -32,28 +32,28 @@ This is the easiest way to get started with vLLM on Hugging Face Inference Endpo
import os import os
client = OpenAI( client = OpenAI(
base_url = DEPLOYMENT_URL, base_url=DEPLOYMENT_URL,
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model = "HuggingFaceTB/SmolLM3-3B", model="HuggingFaceTB/SmolLM3-3B",
messages = [ messages=[
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "Give me a brief explanation of gravity in simple terms." "text": "Give me a brief explanation of gravity in simple terms.",
} }
] ],
} }
], ],
stream = True stream=True,
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end = "") print(message.choices[0].delta.content, end="")
``` ```
!!! note !!! note
...@@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg ...@@ -86,34 +86,34 @@ This method applies to models with the [`transformers` library tag](https://hugg
import os import os
client = OpenAI( client = OpenAI(
base_url = DEPLOYMENT_URL, base_url=DEPLOYMENT_URL,
api_key = os.environ["HF_TOKEN"] # https://huggingface.co/settings/tokens api_key=os.environ["HF_TOKEN"], # https://huggingface.co/settings/tokens
) )
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
model = "ibm-granite/granite-docling-258M", model="ibm-granite/granite-docling-258M",
messages = [ messages=[
{ {
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {
"url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png" "url": "https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/assets/new_arxiv.png",
} },
}, },
{ {
"type": "text", "type": "text",
"text": "Convert this page to docling." "text": "Convert this page to docling.",
} },
] ]
} }
], ],
stream = True stream=True,
) )
for message in chat_completion: for message in chat_completion:
print(message.choices[0].delta.content, end = "") print(message.choices[0].delta.content, end="")
``` ```
!!! note !!! note
......
...@@ -36,7 +36,7 @@ pip install vllm litellm ...@@ -36,7 +36,7 @@ pip install vllm litellm
```python ```python
import litellm import litellm
messages = [{ "content": "Hello, how are you?","role": "user"}] messages = [{"content": "Hello, how are you?", "role": "user"}]
# The hosted_vllm prefix keyword is required # The hosted_vllm prefix keyword is required
response = litellm.completion( response = litellm.completion(
...@@ -44,7 +44,8 @@ pip install vllm litellm ...@@ -44,7 +44,8 @@ pip install vllm litellm
messages=messages, messages=messages,
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2, temperature=0.2,
max_tokens=80) max_tokens=80,
)
print(response) print(response)
``` ```
......
...@@ -40,7 +40,7 @@ pip install -U vllm \ ...@@ -40,7 +40,7 @@ pip install -U vllm \
1. Run the script 1. Run the script
```python ```bash
python retrieval_augmented_generation_with_langchain.py python retrieval_augmented_generation_with_langchain.py
``` ```
...@@ -78,6 +78,6 @@ pip install vllm \ ...@@ -78,6 +78,6 @@ pip install vllm \
1. Run the script: 1. Run the script:
```python ```bash
python retrieval_augmented_generation_with_llamaindex.py python retrieval_augmented_generation_with_llamaindex.py
``` ```
...@@ -106,9 +106,11 @@ The dispatch code looks like: ...@@ -106,9 +106,11 @@ The dispatch code looks like:
batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...) batch_descriptor=BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=...)
runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor) runtime_mode, batch_descriptor = cudagraphdispatcher.dispatch(batch_descriptor)
# execution # execution
with set_forward_context(..., with set_forward_context(
...,
cudagraph_runtime_mode=runtime_mode, cudagraph_runtime_mode=runtime_mode,
batch_descriptor=batch_descriptor): batch_descriptor=batch_descriptor,
):
output = self.model(...) output = self.model(...)
``` ```
...@@ -203,9 +205,9 @@ from vllm.config import CUDAGraphMode ...@@ -203,9 +205,9 @@ from vllm.config import CUDAGraphMode
compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"} compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
model = vllm.LLM( model = vllm.LLM(
model="meta-llama/Llama-3.1-8B-Instruct", model="meta-llama/Llama-3.1-8B-Instruct",
dtype='auto', dtype="auto",
compilation_config = compilation_config, compilation_config=compilation_config,
) )
sampling_params = vllm.SamplingParams( sampling_params = vllm.SamplingParams(
temperature=0, # greedy decoding temperature=0, # greedy decoding
max_tokens=1024, max_tokens=1024,
......
...@@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin ...@@ -9,8 +9,8 @@ When performing an inference with IO Processor plugins, the prompt type is defin
IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>): IO Processor plugins implement the `IOProcessor` interface (<gh-file:vllm/plugins/io_processors/interface.py>):
```python ```python
IOProcessorInput = TypeVar('IOProcessorInput') IOProcessorInput = TypeVar("IOProcessorInput")
IOProcessorOutput = TypeVar('IOProcessorOutput') IOProcessorOutput = TypeVar("IOProcessorOutput")
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
...@@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): ...@@ -21,30 +21,32 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
def pre_process( def pre_process(
self, self,
prompt: IOProcessorInput, prompt: IOProcessorInput,
request_id: Optional[str] = None, request_id: str | None = None,
**kwargs, **kwargs,
) -> Union[PromptType, Sequence[PromptType]]: ) -> PromptType | Sequence[PromptType]:
raise NotImplementedError raise NotImplementedError
async def pre_process_async( async def pre_process_async(
self, self,
prompt: IOProcessorInput, prompt: IOProcessorInput,
request_id: Optional[str] = None, request_id: str | None = None,
**kwargs, **kwargs,
) -> Union[PromptType, Sequence[PromptType]]: ) -> PromptType | Sequence[PromptType]:
return self.pre_process(prompt, request_id, **kwargs) return self.pre_process(prompt, request_id, **kwargs)
@abstractmethod @abstractmethod
def post_process(self, def post_process(
self,
model_output: Sequence[PoolingRequestOutput], model_output: Sequence[PoolingRequestOutput],
request_id: Optional[str] = None, request_id: str | None = None,
**kwargs) -> IOProcessorOutput: **kwargs,
) -> IOProcessorOutput:
raise NotImplementedError raise NotImplementedError
async def post_process_async( async def post_process_async(
self, self,
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]], model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
request_id: Optional[str] = None, request_id: str | None = None,
**kwargs, **kwargs,
) -> IOProcessorOutput: ) -> IOProcessorOutput:
collected_output = [item async for i, item in model_output] collected_output = [item async for i, item in model_output]
...@@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): ...@@ -56,7 +58,8 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@abstractmethod @abstractmethod
def output_to_response( def output_to_response(
self, plugin_output: IOProcessorOutput) -> IOProcessorResponse: self, plugin_output: IOProcessorOutput
) -> IOProcessorResponse:
raise NotImplementedError raise NotImplementedError
``` ```
......
...@@ -478,15 +478,17 @@ us with: ...@@ -478,15 +478,17 @@ us with:
```python ```python
if seq_group.is_finished(): if seq_group.is_finished():
if (seq_group.metrics.first_scheduled_time is not None and if (
seq_group.metrics.first_token_time is not None): seq_group.metrics.first_scheduled_time is not None
and seq_group.metrics.first_token_time is not None
):
time_queue_requests.append( time_queue_requests.append(
seq_group.metrics.first_scheduled_time - seq_group.metrics.first_scheduled_time -
seq_group.metrics.arrival_time) seq_group.metrics.arrival_time
)
... ...
if seq_group.metrics.time_in_queue is not None: if seq_group.metrics.time_in_queue is not None:
time_in_queue_requests.append( time_in_queue_requests.append(seq_group.metrics.time_in_queue)
seq_group.metrics.time_in_queue)
``` ```
This seems duplicative, and one of them should be removed. The latter This seems duplicative, and one of them should be removed. The latter
......
...@@ -112,8 +112,8 @@ class KVCacheBlock: ...@@ -112,8 +112,8 @@ class KVCacheBlock:
ref_cnt: int ref_cnt: int
# The pointers to form a doubly linked list for the free queue. # The pointers to form a doubly linked list for the free queue.
prev_free_block: Optional["KVCacheBlock"] = None prev_free_block: "KVCacheBlock | None" = None
next_free_block: Optional["KVCacheBlock"] = None next_free_block: "KVCacheBlock | None" = None
``` ```
There are two design points to highlight: There are two design points to highlight:
......
...@@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter. ...@@ -32,7 +32,7 @@ the third parameter is the path to the LoRA adapter.
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, temperature=0,
max_tokens=256, max_tokens=256,
stop=["[/assistant]"] stop=["[/assistant]"],
) )
prompts = [ prompts = [
...@@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter. ...@@ -43,7 +43,7 @@ the third parameter is the path to the LoRA adapter.
outputs = llm.generate( outputs = llm.generate(
prompts, prompts,
sampling_params, sampling_params,
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) lora_request=LoRARequest("sql_adapter", 1, sql_lora_path),
) )
``` ```
...@@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin: ...@@ -197,7 +197,7 @@ Alternatively, follow these example steps to implement your own plugin:
lora_request = LoRARequest( lora_request = LoRARequest(
lora_name=lora_name, lora_name=lora_name,
lora_path=local_path, lora_path=local_path,
lora_int_id=abs(hash(lora_name)) lora_int_id=abs(hash(lora_name)),
) )
return lora_request return lora_request
``` ```
...@@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au ...@@ -296,10 +296,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
if has_audio: if has_audio:
question = f"<|audio|>{question}" question = f"<|audio|>{question}"
chat = [ chat = [
{ {"role": "user", "content": question},
"role": "user",
"content": question
}
] ]
return tokenizer.apply_chat_template(chat, tokenize=False) return tokenizer.apply_chat_template(chat, tokenize=False)
......
...@@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis ...@@ -154,9 +154,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
outputs = llm.generate({ outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": { "multi_modal_data": {"image": [image1, image2]},
"image": [image1, image2]
},
}) })
for o in outputs: for o in outputs:
...@@ -183,21 +181,24 @@ conversation = [ ...@@ -183,21 +181,24 @@ conversation = [
{"role": "assistant", "content": "Hello! How can I assist you today?"}, {"role": "assistant", "content": "Hello! How can I assist you today?"},
{ {
"role": "user", "role": "user",
"content": [{ "content": [
{
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {"url": image_url},
"url": image_url },
} {
},{
"type": "image_pil", "type": "image_pil",
"image_pil": image_pil "image_pil": image_pil,
}, { },
{
"type": "image_embeds", "type": "image_embeds",
"image_embeds": image_embeds "image_embeds": image_embeds,
}, { },
{
"type": "text", "type": "text",
"text": "What's in these images?" "text": "What's in these images?",
}], },
],
}, },
] ]
...@@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with ...@@ -224,7 +225,10 @@ Multi-image input can be extended to perform video captioning. We show this with
message = { message = {
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, {
"type": "text",
"text": "Describe this set of frames. Consider the frames to be a part of the same video.",
},
], ],
} }
for i in range(len(video_frames)): for i in range(len(video_frames)):
...@@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f ...@@ -255,13 +259,13 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
# Custom black background for dark theme # Custom black background for dark theme
llm = LLM( llm = LLM(
model="llava-hf/llava-1.5-7b-hf", model="llava-hf/llava-1.5-7b-hf",
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}} media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}},
) )
# Custom brand color background (e.g., blue) # Custom brand color background (e.g., blue)
llm = LLM( llm = LLM(
model="llava-hf/llava-1.5-7b-hf", model="llava-hf/llava-1.5-7b-hf",
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}} media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}},
) )
``` ```
...@@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown ...@@ -294,20 +298,23 @@ Instead of NumPy arrays, you can also pass `'torch.Tensor'` instances, as shown
limit_mm_per_prompt={"video": 1}, limit_mm_per_prompt={"video": 1},
) )
sampling_params = SamplingParams( sampling_params = SamplingParams(max_tokens=1024)
max_tokens=1024,
)
video_messages = [ video_messages = [
{"role": "system", "content": "You are a helpful assistant."}, {
{"role": "user", "content": [ "role": "system",
"content": "You are a helpful assistant.",
},
{
"role": "user",
"content": [
{"type": "text", "text": "describe this video."}, {"type": "text", "text": "describe this video."},
{ {
"type": "video", "type": "video",
"video": video_path, "video": video_path,
"total_pixels": 20480 * 28 * 28, "total_pixels": 20480 * 28 * 28,
"min_pixels": 16 * 28 * 28 "min_pixels": 16 * 28 * 28,
} },
] ]
}, },
] ]
...@@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows: ...@@ -465,21 +472,24 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create( chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
messages=[{ messages=[
{
"role": "user", "role": "user",
"content": [ "content": [
# NOTE: The prompt formatting with the image token `<image>` is not needed # NOTE: The prompt formatting with the image token `<image>` is not needed
# since the prompt will be processed automatically by the API server. # since the prompt will be processed automatically by the API server.
{"type": "text", "text": "What’s in this image?"},
{ {
"type": "image_url", "type": "text",
"image_url": { "text": "What’s in this image?",
url": image_url
}, },
"uuid": image_url # Optional {
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_url, # Optional
}, },
], ],
}], }
],
) )
print("Chat completion output:", chat_response.choices[0].message.content) print("Chat completion output:", chat_response.choices[0].message.content)
...@@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows: ...@@ -489,26 +499,27 @@ Then, you can use the OpenAI client as follows:
chat_response = client.chat.completions.create( chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
messages=[{ messages=[
{
"role": "user", "role": "user",
"content": [ "content": [
{"type": "text", "text": "What are the animals in these images?"},
{ {
"type": "image_url", "type": "text",
"image_url": { "text": "What are the animals in these images?",
"url": image_url_duck
},
"uuid": image_url_duck # Optional
}, },
{ {
"type": "image_url", "type": "image_url",
"image_url": { "image_url": {"url": image_url_duck},
"url": image_url_lion "uuid": image_url_duck, # Optional
}, },
"uuid": image_url_lion # Optional {
"type": "image_url",
"image_url": {"url": image_url_lion},
"uuid": image_url_lion, # Optional
}, },
], ],
}], }
],
) )
print("Chat completion output:", chat_response.choices[0].message.content) print("Chat completion output:", chat_response.choices[0].message.content)
``` ```
...@@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows: ...@@ -560,23 +571,22 @@ Then, you can use the OpenAI client as follows:
## Use video url in the payload ## Use video url in the payload
chat_completion_from_url = client.chat.completions.create( chat_completion_from_url = client.chat.completions.create(
messages=[{ messages=[
"role": {
"user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "What's in this video?" "text": "What's in this video?",
}, },
{ {
"type": "video_url", "type": "video_url",
"video_url": { "video_url": {"url": video_url},
"url": video_url "uuid": video_url, # Optional
},
"uuid": video_url # Optional
}, },
], ],
}], }
],
model=model, model=model,
max_completion_tokens=64, max_completion_tokens=64,
) )
...@@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows: ...@@ -652,23 +662,25 @@ Then, you can use the OpenAI client as follows:
audio_base64 = encode_base64_content_from_url(audio_url) audio_base64 = encode_base64_content_from_url(audio_url)
chat_completion_from_base64 = client.chat.completions.create( chat_completion_from_base64 = client.chat.completions.create(
messages=[{ messages=[
{
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "What's in this audio?" "text": "What's in this audio?",
}, },
{ {
"type": "input_audio", "type": "input_audio",
"input_audio": { "input_audio": {
"data": audio_base64, "data": audio_base64,
"format": "wav" "format": "wav",
}, },
"uuid": audio_url # Optional "uuid": audio_url, # Optional
},
],
}, },
], ],
}],
model=model, model=model,
max_completion_tokens=64, max_completion_tokens=64,
) )
...@@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag ...@@ -683,22 +695,22 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
```python ```python
chat_completion_from_url = client.chat.completions.create( chat_completion_from_url = client.chat.completions.create(
messages=[{ messages=[
{
"role": "user", "role": "user",
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "What's in this audio?" "text": "What's in this audio?",
}, },
{ {
"type": "audio_url", "type": "audio_url",
"audio_url": { "audio_url": {"url": audio_url},
"url": audio_url "uuid": audio_url, # Optional
},
"uuid": audio_url # Optional
}, },
], ],
}], }
],
model=model, model=model,
max_completion_tokens=64, max_completion_tokens=64,
) )
...@@ -750,7 +762,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se ...@@ -750,7 +762,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
embeds = { embeds = {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": f"{base64_image_embedding}", "image_embeds": f"{base64_image_embedding}",
"uuid": image_url # Optional "uuid": image_url, # Optional
} }
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V) # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
...@@ -758,24 +770,29 @@ The following example demonstrates how to pass image embeddings to the OpenAI se ...@@ -758,24 +770,29 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
embeds = { embeds = {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": { "image_embeds": {
"image_embeds": f"{base64_image_embedding}" , # Required "image_embeds": f"{base64_image_embedding}", # Required
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct "image_grid_thw": f"{base64_image_grid_thw}", # Required by Qwen/Qwen2-VL-2B-Instruct
}, },
"uuid": image_url # Optional "uuid": image_url, # Optional
} }
model = "openbmb/MiniCPM-V-2_6" model = "openbmb/MiniCPM-V-2_6"
embeds = { embeds = {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": { "image_embeds": {
"image_embeds": f"{base64_image_embedding}" , # Required "image_embeds": f"{base64_image_embedding}", # Required
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 "image_sizes": f"{base64_image_sizes}", # Required by openbmb/MiniCPM-V-2_6
}, },
"uuid": image_url # Optional "uuid": image_url, # Optional
} }
chat_completion = client.chat.completions.create( chat_completion = client.chat.completions.create(
messages=[ messages=[
{"role": "system", "content": "You are a helpful assistant."}, {
{"role": "user", "content": [ "role": "system",
"content": "You are a helpful assistant.",
},
{
"role": "user",
"content": [
{ {
"type": "text", "type": "text",
"text": "What's in this image?", "text": "What's in this image?",
...@@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit ...@@ -802,22 +819,22 @@ For Online Serving, you can also skip sending media if you expect cache hits wit
{ {
"type": "image_embeds", "type": "image_embeds",
"image_embeds": None, "image_embeds": None,
"uuid": image_uuid "uuid": image_uuid,
}, },
# input_audio: # input_audio:
{ {
"type": "input_audio", "type": "input_audio",
"input_audio": None, "input_audio": None,
"uuid": audio_uuid "uuid": audio_uuid,
}, },
# PIL Image: # PIL Image:
{ {
"type": "image_pil", "type": "image_pil",
"image_pil": None "image_pil": None,
"uuid": image_uuid "uuid": image_uuid,
} },
``` ```
......
...@@ -117,9 +117,11 @@ OpenAI Python client library does not officially support `reasoning_content` att ...@@ -117,9 +117,11 @@ OpenAI Python client library does not officially support `reasoning_content` att
# For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}` # For granite, add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
# For Qwen3 series, if you want to disable thinking in reasoning mode, add: # For Qwen3 series, if you want to disable thinking in reasoning mode, add:
# extra_body={"chat_template_kwargs": {"enable_thinking": False}} # extra_body={"chat_template_kwargs": {"enable_thinking": False}}
stream = client.chat.completions.create(model=model, stream = client.chat.completions.create(
model=model,
messages=messages, messages=messages,
stream=True) stream=True,
)
print("client: Start streaming chat completions...") print("client: Start streaming chat completions...")
printed_reasoning_content = False printed_reasoning_content = False
...@@ -159,7 +161,8 @@ The reasoning content is also available when both tool calling and the reasoning ...@@ -159,7 +161,8 @@ The reasoning content is also available when both tool calling and the reasoning
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy") client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
tools = [{ tools = [
{
"type": "function", "type": "function",
"function": { "function": {
"name": "get_weather", "name": "get_weather",
...@@ -168,18 +171,19 @@ The reasoning content is also available when both tool calling and the reasoning ...@@ -168,18 +171,19 @@ The reasoning content is also available when both tool calling and the reasoning
"type": "object", "type": "object",
"properties": { "properties": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
}, },
"required": ["location", "unit"] "required": ["location", "unit"],
} }
},
} }
}] ]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto" tool_choice="auto",
) )
print(response) print(response)
...@@ -225,7 +229,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_ ...@@ -225,7 +229,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
previous_token_ids: Sequence[int], previous_token_ids: Sequence[int],
current_token_ids: Sequence[int], current_token_ids: Sequence[int],
delta_token_ids: Sequence[int], delta_token_ids: Sequence[int],
) -> Union[DeltaMessage, None]: ) -> DeltaMessage | None:
""" """
Instance method that should be implemented for extracting reasoning Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and from an incomplete response; for use when handling reasoning calls and
...@@ -235,8 +239,10 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_ ...@@ -235,8 +239,10 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_
""" """
def extract_reasoning_content( def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest self,
) -> tuple[Optional[str], Optional[str]]: model_output: str,
request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
""" """
Extract reasoning content from a complete model-generated string. Extract reasoning content from a complete model-generated string.
...@@ -274,10 +280,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner ...@@ -274,10 +280,10 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
@classmethod @classmethod
def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner: def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> Reasoner:
return cls(start_token_id=tokenizer.encode( return cls(
"<think>", add_special_tokens=False)[0], start_token_id=tokenizer.encode("<think>", add_special_tokens=False)[0],
end_token_id=tokenizer.encode("</think>", end_token_id=tokenizer.encode("</think>", add_special_tokens=False)[0],
add_special_tokens=False)[0]) )
def is_reasoning_end(self, input_ids: list[int]) -> bool: def is_reasoning_end(self, input_ids: list[int]) -> bool:
return self.end_token_id in input_ids return self.end_token_id in input_ids
......
...@@ -27,7 +27,8 @@ Next, make a request that triggers the model to use the available tools: ...@@ -27,7 +27,8 @@ Next, make a request that triggers the model to use the available tools:
return f"Getting the weather for {location} in {unit}..." return f"Getting the weather for {location} in {unit}..."
tool_functions = {"get_weather": get_weather} tool_functions = {"get_weather": get_weather}
tools = [{ tools = [
{
"type": "function", "type": "function",
"function": { "function": {
"name": "get_weather", "name": "get_weather",
...@@ -38,16 +39,17 @@ Next, make a request that triggers the model to use the available tools: ...@@ -38,16 +39,17 @@ Next, make a request that triggers the model to use the available tools:
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"}, "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]} "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
}, },
"required": ["location", "unit"] "required": ["location", "unit"],
} },
} },
}] },
]
response = client.chat.completions.create( response = client.chat.completions.create(
model=client.models.list().data[0].id, model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}], messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools, tools=tools,
tool_choice="auto" tool_choice="auto",
) )
tool_call = response.choices[0].message.tool_calls[0].function tool_call = response.choices[0].message.tool_calls[0].function
...@@ -402,8 +404,7 @@ Here is a summary of a plugin file: ...@@ -402,8 +404,7 @@ Here is a summary of a plugin file:
# adjust request. e.g.: set skip special tokens # adjust request. e.g.: set skip special tokens
# to False for tool call output. # to False for tool call output.
def adjust_request( def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
self, request: ChatCompletionRequest) -> ChatCompletionRequest:
return request return request
# implement the tool call parse for stream call # implement the tool call parse for stream call
...@@ -416,7 +417,7 @@ Here is a summary of a plugin file: ...@@ -416,7 +417,7 @@ Here is a summary of a plugin file:
current_token_ids: Sequence[int], current_token_ids: Sequence[int],
delta_token_ids: Sequence[int], delta_token_ids: Sequence[int],
request: ChatCompletionRequest, request: ChatCompletionRequest,
) -> Union[DeltaMessage, None]: ) -> DeltaMessage | None:
return delta return delta
# implement the tool parse for non-stream call # implement the tool parse for non-stream call
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment