Commit 711aa9d5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.0' into v0.10.0-dev

parents 751c492c 6d8d0a24
...@@ -39,6 +39,8 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link . ...@@ -39,6 +39,8 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .
:root { :root {
--md-admonition-icon--announcement: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16"><path d="M3.25 9a.75.75 0 0 1 .75.75c0 2.142.456 3.828.733 4.653a.122.122 0 0 0 .05.064.212.212 0 0 0 .117.033h1.31c.085 0 .18-.042.258-.152a.45.45 0 0 0 .075-.366A16.743 16.743 0 0 1 6 9.75a.75.75 0 0 1 1.5 0c0 1.588.25 2.926.494 3.85.293 1.113-.504 2.4-1.783 2.4H4.9c-.686 0-1.35-.41-1.589-1.12A16.4 16.4 0 0 1 2.5 9.75.75.75 0 0 1 3.25 9Z"></path><path d="M0 6a4 4 0 0 1 4-4h2.75a.75.75 0 0 1 .75.75v6.5a.75.75 0 0 1-.75.75H4a4 4 0 0 1-4-4Zm4-2.5a2.5 2.5 0 1 0 0 5h2v-5Z"></path><path d="M15.59.082A.75.75 0 0 1 16 .75v10.5a.75.75 0 0 1-1.189.608l-.002-.001h.001l-.014-.01a5.775 5.775 0 0 0-.422-.25 10.63 10.63 0 0 0-1.469-.64C11.576 10.484 9.536 10 6.75 10a.75.75 0 0 1 0-1.5c2.964 0 5.174.516 6.658 1.043.423.151.787.302 1.092.443V2.014c-.305.14-.669.292-1.092.443C11.924 2.984 9.713 3.5 6.75 3.5a.75.75 0 0 1 0-1.5c2.786 0 4.826-.484 6.155-.957.665-.236 1.154-.47 1.47-.64.144-.077.284-.161.421-.25l.014-.01a.75.75 0 0 1 .78-.061Z"></path></svg>'); --md-admonition-icon--announcement: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16"><path d="M3.25 9a.75.75 0 0 1 .75.75c0 2.142.456 3.828.733 4.653a.122.122 0 0 0 .05.064.212.212 0 0 0 .117.033h1.31c.085 0 .18-.042.258-.152a.45.45 0 0 0 .075-.366A16.743 16.743 0 0 1 6 9.75a.75.75 0 0 1 1.5 0c0 1.588.25 2.926.494 3.85.293 1.113-.504 2.4-1.783 2.4H4.9c-.686 0-1.35-.41-1.589-1.12A16.4 16.4 0 0 1 2.5 9.75.75.75 0 0 1 3.25 9Z"></path><path d="M0 6a4 4 0 0 1 4-4h2.75a.75.75 0 0 1 .75.75v6.5a.75.75 0 0 1-.75.75H4a4 4 0 0 1-4-4Zm4-2.5a2.5 2.5 0 1 0 0 5h2v-5Z"></path><path d="M15.59.082A.75.75 0 0 1 16 .75v10.5a.75.75 0 0 1-1.189.608l-.002-.001h.001l-.014-.01a5.775 5.775 0 0 0-.422-.25 10.63 10.63 0 0 0-1.469-.64C11.576 10.484 9.536 10 6.75 10a.75.75 0 0 1 0-1.5c2.964 0 5.174.516 6.658 1.043.423.151.787.302 1.092.443V2.014c-.305.14-.669.292-1.092.443C11.924 2.984 9.713 3.5 6.75 3.5a.75.75 0 0 1 0-1.5c2.786 0 4.826-.484 6.155-.957.665-.236 1.154-.47 1.47-.64.144-.077.284-.161.421-.25l.014-.01a.75.75 0 0 1 .78-.061Z"></path></svg>');
--md-admonition-icon--important: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16"><path d="M4.47.22A.749.749 0 0 1 5 0h6c.199 0 .389.079.53.22l4.25 4.25c.141.14.22.331.22.53v6a.749.749 0 0 1-.22.53l-4.25 4.25A.749.749 0 0 1 11 16H5a.749.749 0 0 1-.53-.22L.22 11.53A.749.749 0 0 1 0 11V5c0-.199.079-.389.22-.53Zm.84 1.28L1.5 5.31v5.38l3.81 3.81h5.38l3.81-3.81V5.31L10.69 1.5ZM8 4a.75.75 0 0 1 .75.75v3.5a.75.75 0 0 1-1.5 0v-3.5A.75.75 0 0 1 8 4Zm0 8a1 1 0 1 1 0-2 1 1 0 0 1 0 2Z"></path></svg>'); --md-admonition-icon--important: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16"><path d="M4.47.22A.749.749 0 0 1 5 0h6c.199 0 .389.079.53.22l4.25 4.25c.141.14.22.331.22.53v6a.749.749 0 0 1-.22.53l-4.25 4.25A.749.749 0 0 1 11 16H5a.749.749 0 0 1-.53-.22L.22 11.53A.749.749 0 0 1 0 11V5c0-.199.079-.389.22-.53Zm.84 1.28L1.5 5.31v5.38l3.81 3.81h5.38l3.81-3.81V5.31L10.69 1.5ZM8 4a.75.75 0 0 1 .75.75v3.5a.75.75 0 0 1-1.5 0v-3.5A.75.75 0 0 1 8 4Zm0 8a1 1 0 1 1 0-2 1 1 0 0 1 0 2Z"></path></svg>');
--md-admonition-icon--code: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path d="m11.28 3.22 4.25 4.25a.75.75 0 0 1 0 1.06l-4.25 4.25a.749.749 0 0 1-1.275-.326.75.75 0 0 1 .215-.734L13.94 8l-3.72-3.72a.749.749 0 0 1 .326-1.275.75.75 0 0 1 .734.215m-6.56 0a.75.75 0 0 1 1.042.018.75.75 0 0 1 .018 1.042L2.06 8l3.72 3.72a.749.749 0 0 1-.326 1.275.75.75 0 0 1-.734-.215L.47 8.53a.75.75 0 0 1 0-1.06Z"/></svg>');
--md-admonition-icon--console: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path d="M0 2.75C0 1.784.784 1 1.75 1h12.5c.966 0 1.75.784 1.75 1.75v10.5A1.75 1.75 0 0 1 14.25 15H1.75A1.75 1.75 0 0 1 0 13.25Zm1.75-.25a.25.25 0 0 0-.25.25v10.5c0 .138.112.25.25.25h12.5a.25.25 0 0 0 .25-.25V2.75a.25.25 0 0 0-.25-.25ZM7.25 8a.75.75 0 0 1-.22.53l-2.25 2.25a.749.749 0 0 1-1.275-.326.75.75 0 0 1 .215-.734L5.44 8 3.72 6.28a.749.749 0 0 1 .326-1.275.75.75 0 0 1 .734.215l2.25 2.25c.141.14.22.331.22.53m1.5 1.5h3a.75.75 0 0 1 0 1.5h-3a.75.75 0 0 1 0-1.5"/></svg>');
} }
.md-typeset .admonition.announcement, .md-typeset .admonition.announcement,
...@@ -49,6 +51,14 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link . ...@@ -49,6 +51,14 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .
.md-typeset details.important { .md-typeset details.important {
border-color: rgb(239, 85, 82); border-color: rgb(239, 85, 82);
} }
.md-typeset .admonition.code,
.md-typeset details.code {
border-color: #64dd17
}
.md-typeset .admonition.console,
.md-typeset details.console {
border-color: #64dd17
}
.md-typeset .announcement > .admonition-title, .md-typeset .announcement > .admonition-title,
.md-typeset .announcement > summary { .md-typeset .announcement > summary {
...@@ -58,6 +68,14 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link . ...@@ -58,6 +68,14 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .
.md-typeset .important > summary { .md-typeset .important > summary {
background-color: rgb(239, 85, 82, 0.1); background-color: rgb(239, 85, 82, 0.1);
} }
.md-typeset .code > .admonition-title,
.md-typeset .code > summary {
background-color: #64dd171a;
}
.md-typeset .console > .admonition-title,
.md-typeset .console > summary {
background-color: #64dd171a;
}
.md-typeset .announcement > .admonition-title::before, .md-typeset .announcement > .admonition-title::before,
.md-typeset .announcement > summary::before { .md-typeset .announcement > summary::before {
...@@ -71,6 +89,18 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link . ...@@ -71,6 +89,18 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .
-webkit-mask-image: var(--md-admonition-icon--important); -webkit-mask-image: var(--md-admonition-icon--important);
mask-image: var(--md-admonition-icon--important); mask-image: var(--md-admonition-icon--important);
} }
.md-typeset .code > .admonition-title::before,
.md-typeset .code > summary::before {
background-color: #64dd17;
-webkit-mask-image: var(--md-admonition-icon--code);
mask-image: var(--md-admonition-icon--code);
}
.md-typeset .console > .admonition-title::before,
.md-typeset .console > summary::before {
background-color: #64dd17;
-webkit-mask-image: var(--md-admonition-icon--console);
mask-image: var(--md-admonition-icon--console);
}
/* Make label fully visible on hover */ /* Make label fully visible on hover */
.md-content__button[href*="edit"]:hover::after { .md-content__button[href*="edit"]:hover::after {
...@@ -143,3 +173,13 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link . ...@@ -143,3 +173,13 @@ body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .
[data-md-color-scheme="slate"] .logo-light { [data-md-color-scheme="slate"] .logo-light {
display: none; display: none;
} }
/* Outline for content tabs */
.md-typeset .tabbed-set {
border: 0.075rem solid var(--md-default-fg-color);
border-radius: 0.2rem;
}
.md-typeset .tabbed-content {
padding: 0 0.6em;
}
\ No newline at end of file
--- # Loading models with Run:ai Model Streamer
title: Loading models with Run:ai Model Streamer
---
[](){ #runai-model-streamer }
Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory. Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory.
Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md). Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md).
......
--- # Loading models with CoreWeave's Tensorizer
title: Loading models with CoreWeave's Tensorizer
---
[](){ #tensorizer }
vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer).
vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
...@@ -10,7 +7,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor ...@@ -10,7 +7,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
For more information on CoreWeave's Tensorizer, please refer to For more information on CoreWeave's Tensorizer, please refer to
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
the [vLLM example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html). the [vLLM example script](../../examples/others/tensorize_vllm_model.md).
!!! note !!! note
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
--- # Generative Models
title: Generative Models
---
[](){ #generative-models }
vLLM provides first-class support for generative models, which covers most of LLMs. vLLM provides first-class support for generative models, which covers most of LLMs.
...@@ -85,7 +82,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc ...@@ -85,7 +82,7 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
In general, only instruction-tuned models have a chat template. In general, only instruction-tuned models have a chat template.
Base models may perform poorly as they are not trained to respond to the chat conversation. Base models may perform poorly as they are not trained to respond to the chat conversation.
??? Code ??? code
```python ```python
from vllm import LLM from vllm import LLM
...@@ -134,7 +131,7 @@ outputs = llm.chat(conversation, chat_template=custom_template) ...@@ -134,7 +131,7 @@ outputs = llm.chat(conversation, chat_template=custom_template)
## Online Serving ## Online Serving
Our [OpenAI-Compatible Server][serving-openai-compatible-server] provides endpoints that correspond to the offline APIs: Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
- [Completions API][completions-api] is similar to `LLM.generate` but only accepts text. - [Completions API][completions-api] is similar to `LLM.generate` but only accepts text.
- [Chat API][chat-api] is similar to `LLM.chat`, accepting both text and [multi-modal inputs][multimodal-inputs] for models with a chat template. - [Chat API][chat-api] is similar to `LLM.chat`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for models with a chat template.
--- # TPU
title: TPU
---
[](){ #tpu-supported-models }
# TPU Supported Models # TPU Supported Models
## Text-only Language Models ## Text-only Language Models
......
--- # Pooling Models
title: Pooling Models
---
[](){ #pooling-models }
vLLM also supports pooling models, including embedding, reranking and reward models. vLLM also supports pooling models, including embedding, reranking and reward models.
...@@ -11,29 +8,54 @@ before returning them. ...@@ -11,29 +8,54 @@ before returning them.
!!! note !!! note
We currently support pooling models primarily as a matter of convenience. We currently support pooling models primarily as a matter of convenience.
As shown in the [Compatibility Matrix][compatibility-matrix], most vLLM features are not applicable to As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to
pooling models as they only work on the generation or decode stage, so performance may not improve as much. pooling models as they only work on the generation or decode stage, so performance may not improve as much.
For pooling models, we support the following `--task` options. If the model doesn't implement this interface, you can set `--task` which tells vLLM
The selected option sets the default pooler used to extract the final hidden states: to convert the model into a pooling model.
| Task | Pooling Type | Normalization | Softmax | | `--task` | Model type | Supported pooling tasks |
|---------------------------------|----------------|-----------------|-----------| |------------|----------------------|-------------------------------|
| Embedding (`embed`) | `LAST` | ✅︎ | ❌ | | `embed` | Embedding model | `encode`, `embed` |
| Classification (`classify`) | `LAST` | ❌ | ✅︎ | | `classify` | Classification model | `encode`, `classify`, `score` |
| Sentence Pair Scoring (`score`) | \* | \* | \* | | `reward` | Reward model | `encode` |
\*The default pooler is always defined by the model. ## Pooling Tasks
!!! note In vLLM, we define the following pooling tasks and corresponding APIs:
If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
| Task | APIs |
|------------|--------------------|
| `encode` | `encode` |
| `embed` | `embed`, `score`\* |
| `classify` | `classify` |
| `score` | `score` |
\*The `score` API falls back to `embed` task if the model does not support `score` task.
Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.Pooler.get_supported_tasks].
By default, the pooler assigned to each task has the following attributes:
| Task | Pooling Type | Normalization | Softmax |
|------------|----------------|---------------|---------|
| `encode` | `ALL` | ❌ | ❌ |
| `embed` | `LAST` | ✅︎ | ❌ |
| `classify` | `LAST` | ❌ | ✅︎ |
These defaults may be overridden by the model's implementation in vLLM.
When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). we attempt to override the defaults based on its Sentence Transformers configuration file (`modules.json`),
which takes priority over the model's defaults.
You can further customize this via the `--override-pooler-config` option,
which takes priority over both the model's and Sentence Transformers's defaults.
!!! note
!!! tip The above configuration may be disregarded if the model's implementation in vLLM defines its own pooler
You can customize the model's pooling method via the `--override-pooler-config` option, that is not based on [PoolerConfig][vllm.config.PoolerConfig].
which takes priority over both the model's and Sentence Transformers's defaults.
## Offline Inference ## Offline Inference
...@@ -113,10 +135,10 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/scor ...@@ -113,10 +135,10 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/scor
## Online Serving ## Online Serving
Our [OpenAI-Compatible Server][serving-openai-compatible-server] provides endpoints that correspond to the offline APIs: Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs:
- [Pooling API][pooling-api] is similar to `LLM.encode`, being applicable to all types of pooling models. - [Pooling API][pooling-api] is similar to `LLM.encode`, being applicable to all types of pooling models.
- [Embeddings API][embeddings-api] is similar to `LLM.embed`, accepting both text and [multi-modal inputs][multimodal-inputs] for embedding models. - [Embeddings API][embeddings-api] is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models.
- [Classification API][classification-api] is similar to `LLM.classify` and is applicable to sequence classification models. - [Classification API][classification-api] is similar to `LLM.classify` and is applicable to sequence classification models.
- [Score API][score-api] is similar to `LLM.score` for cross-encoder models. - [Score API][score-api] is similar to `LLM.score` for cross-encoder models.
...@@ -152,11 +174,11 @@ You can change the output dimensions of embedding models that support Matryoshka ...@@ -152,11 +174,11 @@ You can change the output dimensions of embedding models that support Matryoshka
```python ```python
from vllm import LLM, PoolingParams from vllm import LLM, PoolingParams
model = LLM(model="jinaai/jina-embeddings-v3", llm = LLM(model="jinaai/jina-embeddings-v3",
task="embed", task="embed",
trust_remote_code=True) trust_remote_code=True)
outputs = model.embed(["Follow the white rabbit."], outputs = llm.embed(["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32)) pooling_params=PoolingParams(dimensions=32))
print(outputs[0].outputs) print(outputs[0].outputs)
``` ```
......
--- # Supported Models
title: Supported Models
---
[](){ #supported-models }
vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks. vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks.
If a model supports more than one task, you can set the task via the `--task` argument. If a model supports more than one task, you can set the task via the `--task` argument.
...@@ -21,7 +18,7 @@ These models are what we list in [supported-text-models][supported-text-models] ...@@ -21,7 +18,7 @@ These models are what we list in [supported-text-models][supported-text-models]
### Transformers ### Transformers
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! Vision-language models currently accept only image inputs. Support for video inputs will be added in future releases.
To check if the modeling backend is Transformers, you can simply do this: To check if the modeling backend is Transformers, you can simply do this:
...@@ -31,14 +28,17 @@ llm = LLM(model=..., task="generate") # Name or path of your model ...@@ -31,14 +28,17 @@ llm = LLM(model=..., task="generate") # Name or path of your model
llm.apply_model(lambda model: print(type(model))) llm.apply_model(lambda model: print(type(model)))
``` ```
If it is `TransformersForCausalLM` then it means it's based on Transformers! If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers!
!!! tip !!! tip
You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference][offline-inference] or `--model-impl transformers` for the [openai-compatible-server][serving-openai-compatible-server]. You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md).
!!! note !!! note
vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
!!! note
In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance.
#### Custom models #### Custom models
If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM! If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM!
...@@ -53,8 +53,8 @@ For a model to be compatible with the Transformers backend for vLLM it must: ...@@ -53,8 +53,8 @@ For a model to be compatible with the Transformers backend for vLLM it must:
If the compatible model is: If the compatible model is:
- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference][offline-inference] or `--trust-remote-code` for the [openai-compatible-server][serving-openai-compatible-server]. - on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline-inference](../serving/offline_inference.md) or `--trust-remote-code` for the [openai-compatible-server](../serving/openai_compatible_server.md).
- in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference][offline-inference] or `vllm serve <MODEL_DIR>` for the [openai-compatible-server][serving-openai-compatible-server]. - in a local directory, simply pass directory path to `model=<MODEL_DIR>` for [offline-inference](../serving/offline_inference.md) or `vllm serve <MODEL_DIR>` for the [openai-compatible-server](../serving/openai_compatible_server.md).
This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
...@@ -102,7 +102,7 @@ Here is what happens in the background when this model is loaded: ...@@ -102,7 +102,7 @@ Here is what happens in the background when this model is loaded:
1. The config is loaded. 1. The config is loaded.
2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. 3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
That's it! That's it!
...@@ -171,7 +171,7 @@ The [Transformers backend][transformers-backend] enables you to run models direc ...@@ -171,7 +171,7 @@ The [Transformers backend][transformers-backend] enables you to run models direc
If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
Otherwise, please refer to [Adding a New Model][new-model] for instructions on how to implement your model in vLLM. Otherwise, please refer to [Adding a New Model](../contributing/model/README.md) for instructions on how to implement your model in vLLM.
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
#### Download a model #### Download a model
...@@ -308,91 +308,104 @@ print(output) ...@@ -308,91 +308,104 @@ print(output)
### Generative Models ### Generative Models
See [this page][generative-models] for more information on how to use generative models. See [this page](generative_models.md) for more information on how to use generative models.
#### Text Generation #### Text Generation
Specified using `--task generate`. Specified using `--task generate`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | <style>
|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| th {
| `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | white-space: nowrap;
| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ | min-width: 0 !important;
| `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | }
| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | | </style>
| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
| `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ | |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ | | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ | | `ArceeForCausalLM` | Arcee (AFM) | `arcee-ai/AFM-4.5B-Base`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | | ✅︎ | ✅︎ |
| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | | ✅︎ | ✅︎ | | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | | ✅︎ | ✅︎ | | `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | | ✅︎ | ✅︎ | | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ |
| `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst` etc. | | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
| `Ernie4_5_ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`,etc. | | ✅︎ | ✅︎ | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. | | ✅︎ | ✅︎ | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ | | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ | | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | | | `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ |
| `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ | ✅︎ |
| `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc. | | ✅︎ | ✅︎ |
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ |
| `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `Ernie4_5_ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ |
| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ | | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ | | `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ | | `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ |
| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ | | `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | ✅︎ |
| `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | | | `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | | `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | | `Gemma3nForConditionalGeneration` | Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ |
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | | `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`etc. | | | ✅︎ | | `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | ✅︎ |
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ | ✅︎ |
| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ | | `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | ✅︎ |
| `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | | | `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | | | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ |
| `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
| `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
| `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ | | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | ✅︎ | | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | ✅︎ |
| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | ✅︎ | | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | |
| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ | | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ | | `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | |
| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | | `Mamba2ForCausalLM` | Mamba2 | `mistralai/Mamba-Codestral-7B-v0.1`, etc. | | ✅︎ | ✅︎ |
| `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MiMoForCausalLM` | MiMo | `XiaomiMiMo/MiMo-7B-RL`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | | | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ | ✅︎ | | `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | ✅︎ |
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | ✅︎ |
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ |
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ |
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ |
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ | | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Phi4FlashForCausalLM` | Phi-4-mini-flash-reasoning | `microsoft/microsoft/Phi-4-mini-instruct`, etc. | | | |
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`etc. | | | | | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ |
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | | | `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | |
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | | | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | |
| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | |
| `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
!!! note !!! note
Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
...@@ -412,19 +425,19 @@ See [this page](./pooling_models.md) for more information on how to use pooling ...@@ -412,19 +425,19 @@ See [this page](./pooling_models.md) for more information on how to use pooling
Specified using `--task embed`. Specified using `--task embed`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|----------------------|---------------------------|-----------------------| |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | | | `BertModel` | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | |
| `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ | | `Gemma2Model` | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | |
| `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | ︎ | | | | `GteModel` | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | |
| `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | ︎ | ︎ | | | `GteNewModel` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | |
| `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | ︎ | ︎ | | | `ModernBertModel` | ModernBERT-based | `Alibaba-NLP/gte-modernbert-base`, etc. | | | |
| `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | ︎ | ︎ | | | `NomicBertModel` | Nomic BERT | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. | | | |
| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based | `intfloat/e5-mistral-7b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2Model`, `Qwen2ForCausalLM` | Qwen2-based | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen3Model`, `Qwen3ForCausalLM` | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen3Model`, `Qwen3ForCausalLM` | Qwen3-based | `Qwen/Qwen3-Embedding-0.6B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | | | `RobertaModel`, `RobertaForMaskedLM` | RoBERTa-based | `sentence-transformers/all-roberta-large-v1`, etc. | | | |
!!! note !!! note
`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
...@@ -448,12 +461,12 @@ of the whole prompt are extracted from the normalized hidden state corresponding ...@@ -448,12 +461,12 @@ of the whole prompt are extracted from the normalized hidden state corresponding
Specified using `--task reward`. Specified using `--task reward`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ | | `LlamaForCausalLM` | Llama-based | `peiyi9979/math-shepherd-mistral-7b-prm`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2ForProcessRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-PRM-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
If your model is not in the above list, we will try to automatically convert the model using If your model is not in the above list, we will try to automatically convert the model using
[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly. [as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly.
...@@ -466,10 +479,10 @@ If your model is not in the above list, we will try to automatically convert the ...@@ -466,10 +479,10 @@ If your model is not in the above list, we will try to automatically convert the
Specified using `--task classify`. Specified using `--task classify`.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|-----------------------| |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | | | `JambaForSequenceClassification` | Jamba | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎ | ✅︎ | |
| `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ | | `GPT2ForSequenceClassification` | GPT2 | `nie3e/sentiment-polish-gpt2-small` | | | ✅︎ |
If your model is not in the above list, we will try to automatically convert the model using If your model is not in the above list, we will try to automatically convert the model using
[as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. [as_seq_cls_model][vllm.model_executor.models.adapters.as_seq_cls_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
...@@ -478,13 +491,21 @@ If your model is not in the above list, we will try to automatically convert the ...@@ -478,13 +491,21 @@ If your model is not in the above list, we will try to automatically convert the
Specified using `--task score`. Specified using `--task score`.
| Architecture | Models | Example HF Models | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [V1](gh-issue:8779) |
|---------------------------------------|-------------------|--------------------------------------------------------------------------------------|---------------------| |--------------|--------|-------------------|---------------------|
| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | |
| `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | | `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | |
| `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | | `Qwen2ForSequenceClassification` | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ |
| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | `Qwen3ForSequenceClassification` | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | |
| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | |
!!! note
Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command.
```bash
vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}'
```
!!! note !!! note
Load the official original `mxbai-rerank-v2` by using the following command. Load the official original `mxbai-rerank-v2` by using the following command.
...@@ -519,7 +540,7 @@ On the other hand, modalities separated by `/` are mutually exclusive. ...@@ -519,7 +540,7 @@ On the other hand, modalities separated by `/` are mutually exclusive.
- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. - e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
See [this page][multimodal-inputs] on how to pass multi-modal inputs to the model. See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model.
!!! important !!! important
**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference) **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
...@@ -549,57 +570,58 @@ See [this page][multimodal-inputs] on how to pass multi-modal inputs to the mode ...@@ -549,57 +570,58 @@ See [this page][multimodal-inputs] on how to pass multi-modal inputs to the mode
### Generative Models ### Generative Models
See [this page][generative-models] for more information on how to use generative models. See [this page](generative_models.md) for more information on how to use generative models.
#### Text Generation #### Text Generation
Specified using `--task generate`. Specified using `--task generate`.
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| |--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ | | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | | ✅︎ |
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
| `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ | | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | ✅︎ | ✅︎ | | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ |
| `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | | ✅︎ | ✅︎ | | `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
| `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | | | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
| `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ | | `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.1V-9B-Thinkg`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Glm4MoeForCausalLM` | GLM-4.5 | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4MoeForCausalLM` | GLM-4.5 | T + I<sup>E+</sup> + V<sup>E+</sup> | `THUDM/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* | | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ | | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + I<sup>E+</sup> | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ |
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ |
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + I<sup>E+</sup> | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ |
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ |
| `Mistral3ForConditionalGeneration` | Mistral3 | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | | `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ | | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Pixtral | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | | ✅︎ | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup> | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎ |
| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ |
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`,`omni-search/Tarsier-34b` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`,`omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ | | `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM. <sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models: &nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
...@@ -635,7 +657,7 @@ Specified using `--task generate`. ...@@ -635,7 +657,7 @@ Specified using `--task generate`.
For the best results, we recommend using the following dependency versions (tested on A10 and L40): For the best results, we recommend using the following dependency versions (tested on A10 and L40):
??? Dependency versions ??? code "Dependency versions"
```text ```text
# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
...@@ -678,9 +700,9 @@ Specified using `--task transcription`. ...@@ -678,9 +700,9 @@ Specified using `--task transcription`.
Speech2Text models trained specifically for Automatic Speech Recognition. Speech2Text models trained specifically for Automatic Speech Recognition.
| Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|----------------------------------------------|------------------|------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| |--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | |
### Pooling Models ### Pooling Models
...@@ -701,13 +723,21 @@ Any text generation model can be converted into an embedding model by passing `- ...@@ -701,13 +723,21 @@ Any text generation model can be converted into an embedding model by passing `-
The following table lists those that are tested in vLLM. The following table lists those that are tested in vLLM.
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------| |--------------|--------|--------|-------------------|----------------------|---------------------------|---------------------|
| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | | | `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | |
| `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | | | `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | |
--- ---
#### Scoring
Specified using `--task score`.
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) |
|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|-----------------------|
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | | | ✅︎ |
## Model Support Policy ## Model Support Policy
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
......
# Data Parallel Deployment
vLLM supports Data Parallel deployment, where model weights are replicated across separate instances/GPUs to process independent batches of requests.
This will work with both dense and MoE models.
For MoE models, particularly those like DeepSeek that employ MLA (Multi-head Latent Attention), it can be advantageous to use data parallel for the attention layers and expert or tensor parallel (EP or TP) for the expert layers.
In these cases, the data parallel ranks are not completely independent. Forward passes must be aligned, and expert layers across all ranks are required to synchronize during every forward pass, even when there are fewer requests to be processed than DP ranks.
The expert layers will by default form a (DP x TP) sized tensor parallel group. To enable expert parallelism, include the `--enable-expert-parallel` CLI arg (on all nodes in the multi-node case).
In vLLM, each DP rank is deployed as a separate "core engine" process that communicates with front-end process(es) via ZMQ sockets. Data Parallel attention can be combined with Tensor Parallel attention, in which case each DP engine owns a number of per-GPU worker processes equal to the configured TP size.
For MoE models, when any requests are in progress in any rank, we must ensure that empty "dummy" forward passes are performed in all ranks that don't currently have any requests scheduled. This is handled via a separate DP Coordinator process that communicates with all ranks, and a collective operation performed every N steps to determine when all ranks become idle and can be paused. When TP is used in conjunction with DP, expert layers form an EP or TP group of size (DP x TP).
In all cases, it is beneficial to load-balance requests between DP ranks. For online deployments, this balancing can be optimized by taking into account the state of each DP engine - in particular its currently scheduled and waiting (queued) requests, and KV cache state. Each DP engine has an independent KV cache, and the benefit of prefix caching can be maximized by directing prompts intelligently.
This document focuses on online deployments (with the API server). DP + EP is also supported for offline usage (via the LLM class), for an example see <gh-file:examples/offline_inference/data_parallel.py>.
There are two distinct modes supported for online deployments - self-contained with internal load balancing, or externally per-rank process deployment and load balancing.
## Internal Load Balancing
vLLM supports "self-contained" data parallel deployments that expose a single API endpoint.
It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs.
Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks.
This will run DP=4, TP=2 on a single 8-GPU node:
```bash
vllm serve $MODEL --data-parallel-size 4 --tensor-parallel-size 2
```
This will run DP=4 with DP ranks 0 and 1 on the head node and ranks 2 and 3 on the second node:
```bash
# Node 0 (with ip address 10.99.48.128)
vllm serve $MODEL --data-parallel-size 4 --data-parallel-size-local 2 \
--data-parallel-address 10.99.48.128 --data-parallel-rpc-port 13345
# Node 1
vllm serve $MODEL --headless --data-parallel-size 4 --data-parallel-size-local 2 \
--data-parallel-start-rank 2 \
--data-parallel-address 10.99.48.128 --data-parallel-rpc-port 13345
```
This will run DP=4 with only the API server on the first node and all engines on the second node:
```bash
# Node 0 (with ip address 10.99.48.128)
vllm serve $MODEL --data-parallel-size 4 --data-parallel-size-local 0 \
--data-parallel-address 10.99.48.128 --data-parallel-rpc-port 13345
# Node 1
vllm serve $MODEL --headless --data-parallel-size 4 --data-parallel-size-local 4 \
--data-parallel-address 10.99.48.128 --data-parallel-rpc-port 13345
```
This DP mode can also be used with Ray by specifying `--data-parallel-backend=ray`:
```bash
vllm serve $MODEL --data-parallel-size 4 --data-parallel-size-local 2 \
--data-parallel-backend=ray
```
There are several notable differences when using Ray:
- A single launch command (on any node) is needed to start all local and remote DP ranks, therefore it is more convenient compared to launching on each node
- There is no need to specify `--data-parallel-address`, and the node where the command is run is used as `--data-parallel-address`
- There is no need to specify `--data-parallel-rpc-port`
- Remote DP ranks will be allocated based on node resources of the Ray cluster
Currently, the internal DP load balancing is done within the API server process(es) and is based on the running and waiting queues in each of the engines. This could be made more sophisticated in future by incorporating KV cache aware logic.
When deploying large DP sizes using this method, the API server process can become a bottleneck. In this case, the orthogonal `--api-server-count` command line option can be used to scale this out (for example `--api-server-count=4`). This is transparent to users - a single HTTP endpoint / port is still exposed. Note that this API server scale-out is "internal" and still confined to the "head" node.
<figure markdown="1">
![DP Internal LB Diagram](../assets/deployment/dp_internal_lb.png)
</figure>
## External Load Balancing
For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally.
In this case, it's more convenient to treat each DP rank like a separate vLLM deployment, with its own endpoint, and have an external router balance HTTP requests between them, making use of appropriate real-time telemetry from each server for routing decisions.
This can already be done trivially for non-MoE models, since each deployed server is fully independent. No data parallel CLI options need to be used for this.
We support an equivalent topology for MoE DP+EP which can be configured via the following CLI arguments.
If DP ranks are co-located (same node / ip address), a default RPC port is used, but a different HTTP server port must be specified for each rank:
```bash
# Rank 0
CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL --data-parallel-size 2 --data-parallel-rank 0 \
--port 8000
# Rank 1
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL --data-parallel-size 2 --data-parallel-rank 1 \
--port 8001
```
For multi-node cases, the address/port of rank 0 must also be specified:
```bash
# Rank 0 (with ip address 10.99.48.128)
vllm serve $MODEL --data-parallel-size 2 --data-parallel-rank 0 \
--data-parallel-address 10.99.48.128 --data-parallel-rpc-port 13345
# Rank 1
vllm serve $MODEL --data-parallel-size 2 --data-parallel-rank 1 \
--data-parallel-address 10.99.48.128 --data-parallel-rpc-port 13345
```
The coordinator process also runs in this scenario, co-located with the DP rank 0 engine.
<figure markdown="1">
![DP External LB Diagram](../assets/deployment/dp_external_lb.png)
</figure>
In the above diagram, each of the dotted boxes corresponds to a separate launch of `vllm serve` - these could be separate Kubernetes pods, for example.
--- # Distributed Inference and Serving
title: Distributed Inference and Serving
---
[](){ #distributed-serving }
## How to decide the distributed inference strategy? ## How to decide the distributed inference strategy?
...@@ -18,6 +15,10 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh ...@@ -18,6 +15,10 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
!!! note !!! note
There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
### Distributed serving of MoE (Mixture of Experts) models
It is often advantageous to exploit the inherent parallelism of experts by using a separate parallelism strategy for the expert layers. vLLM supports large-scale deployment combining Data Parallel attention with Expert or Tensor Parallel MoE layers. See the page on [Data Parallel Deployment](data_parallel_deployment.md) for more information.
## Running vLLM on a single node ## Running vLLM on a single node
vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inference currently requires Ray. vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inference currently requires Ray.
......
--- # LangChain
title: LangChain
---
[](){ #serving-langchain }
vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain) . vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain) .
...@@ -13,7 +10,7 @@ pip install langchain langchain_community -q ...@@ -13,7 +10,7 @@ pip install langchain langchain_community -q
To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`.
??? Code ??? code
```python ```python
from langchain_community.llms import VLLM from langchain_community.llms import VLLM
......
--- # LlamaIndex
title: LlamaIndex
---
[](){ #serving-llamaindex }
vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) . vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) .
......
--- # Offline Inference
title: Offline Inference
---
[](){ #offline-inference }
You can run vLLM in your own code on a list of prompts. Offline inference is possible in your own code using vLLM's [`LLM`][vllm.LLM] class.
The offline API is based on the [LLM][vllm.LLM] class.
To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run.
For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace
and runs it in vLLM using the default configuration. and runs it in vLLM using the default configuration.
...@@ -14,16 +8,53 @@ and runs it in vLLM using the default configuration. ...@@ -14,16 +8,53 @@ and runs it in vLLM using the default configuration.
```python ```python
from vllm import LLM from vllm import LLM
# Initialize the vLLM engine.
llm = LLM(model="facebook/opt-125m") llm = LLM(model="facebook/opt-125m")
``` ```
After initializing the `LLM` instance, you can perform model inference using various APIs. After initializing the `LLM` instance, use the available APIs to perform model inference.
The available APIs depend on the type of model that is being run: The available APIs depend on the model type:
- [Generative models][generative-models] output logprobs which are sampled from to obtain the final output text.
- [Pooling models][pooling-models] output their hidden states directly.
Please refer to the above pages for more details about each API. - [Generative models](../models/generative_models.md) output logprobs which are sampled from to obtain the final output text.
- [Pooling models](../models/pooling_models.md) output their hidden states directly.
!!! info !!! info
[API Reference][offline-inference-api] [API Reference][offline-inference-api]
## Ray Data LLM API
Ray Data LLM is an alternative offline inference API that uses vLLM as the underlying engine.
This API adds several batteries-included capabilities that simplify large-scale, GPU-efficient inference:
- Streaming execution processes datasets that exceed aggregate cluster memory.
- Automatic sharding, load balancing, and autoscaling distribute work across a Ray cluster with built-in fault tolerance.
- Continuous batching keeps vLLM replicas saturated and maximizes GPU utilization.
- Transparent support for tensor and pipeline parallelism enables efficient multi-GPU inference.
- Reading and writing to most popular file formats and cloud object storage.
- Scaling up the workload without code changes.
??? code
```python
import ray # Requires ray>=2.44.1
from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor
config = vLLMEngineProcessorConfig(model_source="unsloth/Llama-3.2-1B-Instruct")
processor = build_llm_processor(
config,
preprocess=lambda row: {
"messages": [
{"role": "system", "content": "You are a bot that completes unfinished haikus."},
{"role": "user", "content": row["item"]},
],
"sampling_params": {"temperature": 0.3, "max_tokens": 250},
},
postprocess=lambda row: {"answer": row["generated_text"]},
)
ds = ray.data.from_items(["An old silent pond..."])
ds = processor(ds)
ds.write_parquet("local:///tmp/data/")
```
For more information about the Ray Data LLM API, see the [Ray Data LLM documentation](https://docs.ray.io/en/latest/data/working-with-llms.html).
--- # OpenAI-Compatible Server
title: OpenAI-Compatible Server
---
[](){ #serving-openai-compatible-server }
vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! This functionality lets you serve models and interact with them using an HTTP client. vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! This functionality lets you serve models and interact with them using an HTTP client.
In your terminal, you can [install](../getting_started/installation/README.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.) In your terminal, you can [install](../getting_started/installation/README.md) vLLM, then start the server with the [`vllm serve`](../configuration/serve_args.md) command. (You can also use our [Docker](../deployment/docker.md) image.)
```bash ```bash
vllm serve NousResearch/Meta-Llama-3-8B-Instruct \ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
...@@ -15,7 +12,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \ ...@@ -15,7 +12,7 @@ vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python). To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
??? Code ??? code
```python ```python
from openai import OpenAI from openai import OpenAI
...@@ -146,7 +143,7 @@ completion = client.chat.completions.create( ...@@ -146,7 +143,7 @@ completion = client.chat.completions.create(
Only `X-Request-Id` HTTP request header is supported for now. It can be enabled Only `X-Request-Id` HTTP request header is supported for now. It can be enabled
with `--enable-request-id-headers`. with `--enable-request-id-headers`.
??? Code ??? code
```python ```python
completion = client.chat.completions.create( completion = client.chat.completions.create(
...@@ -185,7 +182,7 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py> ...@@ -185,7 +182,7 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
The following [sampling parameters][sampling-params] are supported. The following [sampling parameters][sampling-params] are supported.
??? Code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params" --8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
...@@ -193,7 +190,7 @@ The following [sampling parameters][sampling-params] are supported. ...@@ -193,7 +190,7 @@ The following [sampling parameters][sampling-params] are supported.
The following extra parameters are supported: The following extra parameters are supported:
??? Code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params" --8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
...@@ -208,7 +205,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai ...@@ -208,7 +205,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
We support both [Vision](https://platform.openai.com/docs/guides/vision)- and We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
see our [Multimodal Inputs][multimodal-inputs] guide for more information. see our [Multimodal Inputs](../features/multimodal_inputs.md) guide for more information.
- *Note: `image_url.detail` parameter is not supported.* - *Note: `image_url.detail` parameter is not supported.*
Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py> Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
...@@ -217,7 +214,7 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py> ...@@ -217,7 +214,7 @@ Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
The following [sampling parameters][sampling-params] are supported. The following [sampling parameters][sampling-params] are supported.
??? Code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params" --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
...@@ -225,7 +222,7 @@ The following [sampling parameters][sampling-params] are supported. ...@@ -225,7 +222,7 @@ The following [sampling parameters][sampling-params] are supported.
The following extra parameters are supported: The following extra parameters are supported:
??? Code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params" --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
...@@ -268,7 +265,7 @@ and passing a list of `messages` in the request. Refer to the examples below for ...@@ -268,7 +265,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
??? Code ??? code
```python ```python
import requests import requests
...@@ -327,7 +324,7 @@ The following [pooling parameters][pooling-params] are supported. ...@@ -327,7 +324,7 @@ The following [pooling parameters][pooling-params] are supported.
The following extra parameters are supported by default: The following extra parameters are supported by default:
??? Code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params" --8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
...@@ -335,7 +332,7 @@ The following extra parameters are supported by default: ...@@ -335,7 +332,7 @@ The following extra parameters are supported by default:
For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
??? Code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params" --8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
...@@ -354,11 +351,16 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai ...@@ -354,11 +351,16 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
Code example: <gh-file:examples/online_serving/openai_transcription_client.py> Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
<!-- TODO: api enforced limits + uploading audios --> <!-- TODO: api enforced limits + uploading audios -->
#### API Enforced Limits
Set the maximum audio file size (in MB) that VLLM will accept, via the
`VLLM_MAX_AUDIO_CLIP_FILESIZE_MB` environment variable. Default is 25 MB.
#### Extra Parameters #### Extra Parameters
The following [sampling parameters][sampling-params] are supported. The following [sampling parameters][sampling-params] are supported.
??? Code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" --8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
...@@ -366,7 +368,7 @@ The following [sampling parameters][sampling-params] are supported. ...@@ -366,7 +368,7 @@ The following [sampling parameters][sampling-params] are supported.
The following extra parameters are supported: The following extra parameters are supported:
??? Code ??? code
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
...@@ -446,9 +448,9 @@ curl -v "http://127.0.0.1:8000/classify" \ ...@@ -446,9 +448,9 @@ curl -v "http://127.0.0.1:8000/classify" \
}' }'
``` ```
??? Response ??? console "Response"
```bash ```json
{ {
"id": "classify-7c87cac407b749a6935d8c7ce2a8fba2", "id": "classify-7c87cac407b749a6935d8c7ce2a8fba2",
"object": "list", "object": "list",
...@@ -494,9 +496,9 @@ curl -v "http://127.0.0.1:8000/classify" \ ...@@ -494,9 +496,9 @@ curl -v "http://127.0.0.1:8000/classify" \
}' }'
``` ```
??? Response ??? console "Response"
```bash ```json
{ {
"id": "classify-9bf17f2847b046c7b2d5495f4b4f9682", "id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
"object": "list", "object": "list",
...@@ -540,7 +542,7 @@ The following extra parameters are supported: ...@@ -540,7 +542,7 @@ The following extra parameters are supported:
### Score API ### Score API
Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair. Our Score API can apply a cross-encoder model or an embedding model to predict scores for sentence or multimodal pairs. When using an embedding model the score corresponds to the cosine similarity between each embedding pair.
Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1. Usually, the score for a sentence pair refers to the similarity between two sentences, on a scale of 0 to 1.
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
...@@ -564,9 +566,9 @@ curl -X 'POST' \ ...@@ -564,9 +566,9 @@ curl -X 'POST' \
}' }'
``` ```
??? Response ??? console "Response"
```bash ```json
{ {
"id": "score-request-id", "id": "score-request-id",
"object": "list", "object": "list",
...@@ -589,7 +591,7 @@ You can pass a string to `text_1` and a list to `text_2`, forming multiple sente ...@@ -589,7 +591,7 @@ You can pass a string to `text_1` and a list to `text_2`, forming multiple sente
where each pair is built from `text_1` and a string in `text_2`. where each pair is built from `text_1` and a string in `text_2`.
The total number of pairs is `len(text_2)`. The total number of pairs is `len(text_2)`.
??? Request ??? console "Request"
```bash ```bash
curl -X 'POST' \ curl -X 'POST' \
...@@ -606,9 +608,9 @@ The total number of pairs is `len(text_2)`. ...@@ -606,9 +608,9 @@ The total number of pairs is `len(text_2)`.
}' }'
``` ```
??? Response ??? console "Response"
```bash ```json
{ {
"id": "score-request-id", "id": "score-request-id",
"object": "list", "object": "list",
...@@ -634,7 +636,7 @@ You can pass a list to both `text_1` and `text_2`, forming multiple sentence pai ...@@ -634,7 +636,7 @@ You can pass a list to both `text_1` and `text_2`, forming multiple sentence pai
where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`). where each pair is built from a string in `text_1` and the corresponding string in `text_2` (similar to `zip()`).
The total number of pairs is `len(text_2)`. The total number of pairs is `len(text_2)`.
??? Request ??? console "Request"
```bash ```bash
curl -X 'POST' \ curl -X 'POST' \
...@@ -655,9 +657,9 @@ The total number of pairs is `len(text_2)`. ...@@ -655,9 +657,9 @@ The total number of pairs is `len(text_2)`.
}' }'
``` ```
??? Response ??? console "Response"
```bash ```json
{ {
"id": "score-request-id", "id": "score-request-id",
"object": "list", "object": "list",
...@@ -679,6 +681,55 @@ The total number of pairs is `len(text_2)`. ...@@ -679,6 +681,55 @@ The total number of pairs is `len(text_2)`.
} }
``` ```
#### Multi-modal inputs
You can pass multi-modal inputs to scoring models by passing `content` including a list of multi-modal input (image, etc.) in the request. Refer to the examples below for illustration.
=== "JinaVL-Reranker"
To serve the model:
```bash
vllm serve jinaai/jina-reranker-m0
```
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
??? Code
```python
import requests
response = requests.post(
"http://localhost:8000/v1/score",
json={
"model": "jinaai/jina-reranker-m0",
"text_1": "slm markdown",
"text_2": {
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
]
}
},
)
response.raise_for_status()
response_json = response.json()
print("Scoring output:", response_json["data"][0]["score"])
print("Scoring output:", response_json["data"][1]["score"])
```
Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_multimodal.py>
#### Extra parameters #### Extra parameters
The following [pooling parameters][pooling-params] are supported. The following [pooling parameters][pooling-params] are supported.
...@@ -698,8 +749,7 @@ The following extra parameters are supported: ...@@ -698,8 +749,7 @@ The following extra parameters are supported:
### Re-rank API ### Re-rank API
Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and Our Re-rank API can apply an embedding model or a cross-encoder model to predict relevant scores between a single query, and
each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences or multi-modal inputs (image, etc.), on a scale of 0 to 1.
a scale of 0 to 1.
You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html).
...@@ -716,7 +766,7 @@ Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py> ...@@ -716,7 +766,7 @@ Code example: <gh-file:examples/online_serving/jinaai_rerank_client.py>
Note that the `top_n` request parameter is optional and will default to the length of the `documents` field. Note that the `top_n` request parameter is optional and will default to the length of the `documents` field.
Result documents will be sorted by relevance, and the `index` property can be used to determine original order. Result documents will be sorted by relevance, and the `index` property can be used to determine original order.
??? Request ??? console "Request"
```bash ```bash
curl -X 'POST' \ curl -X 'POST' \
...@@ -734,9 +784,9 @@ Result documents will be sorted by relevance, and the `index` property can be us ...@@ -734,9 +784,9 @@ Result documents will be sorted by relevance, and the `index` property can be us
}' }'
``` ```
??? Response ??? console "Response"
```bash ```json
{ {
"id": "rerank-fae51b2b664d4ed38f5969b612edff77", "id": "rerank-fae51b2b664d4ed38f5969b612edff77",
"model": "BAAI/bge-reranker-base", "model": "BAAI/bge-reranker-base",
...@@ -775,3 +825,17 @@ The following extra parameters are supported: ...@@ -775,3 +825,17 @@ The following extra parameters are supported:
```python ```python
--8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params" --8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params"
``` ```
## Ray Serve LLM
Ray Serve LLM enables scalable, production-grade serving of the vLLM engine. It integrates tightly with vLLM and extends it with features such as auto-scaling, load balancing, and back-pressure.
Key capabilities:
- Exposes an OpenAI-compatible HTTP API as well as a Pythonic API.
- Scales from a single GPU to a multi-node cluster without code changes.
- Provides observability and autoscaling policies through Ray dashboards and metrics.
The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: <gh-file:examples/online_serving/ray_serve_deepseek.py>.
Learn more about Ray Serve LLM with the official [Ray Serve LLM documentation](https://docs.ray.io/en/latest/serve/llm/serving-llms.html).
--- # Frequently Asked Questions
title: Frequently Asked Questions
---
[](){ #faq }
> Q: How can I serve multiple models on a single port using the OpenAI API? > Q: How can I serve multiple models on a single port using the OpenAI API?
...@@ -12,7 +9,7 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul ...@@ -12,7 +9,7 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul
> Q: Which model to use for offline inference embedding? > Q: Which model to use for offline inference embedding?
A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5); A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5);
more are listed [here][supported-models]. more are listed [here](../models/supported_models.md).
By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B),
[Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models,
......
...@@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the ...@@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the
system. These metrics are exposed via the `/metrics` endpoint on the vLLM system. These metrics are exposed via the `/metrics` endpoint on the vLLM
OpenAI compatible API server. OpenAI compatible API server.
You can start the server using Python, or using [Docker][deployment-docker]: You can start the server using Python, or using [Docker](../deployment/docker.md):
```bash ```bash
vllm serve unsloth/Llama-3.2-1B-Instruct vllm serve unsloth/Llama-3.2-1B-Instruct
...@@ -12,7 +12,7 @@ vllm serve unsloth/Llama-3.2-1B-Instruct ...@@ -12,7 +12,7 @@ vllm serve unsloth/Llama-3.2-1B-Instruct
Then query the endpoint to get the latest metrics from the server: Then query the endpoint to get the latest metrics from the server:
??? Output ??? console "Output"
```console ```console
$ curl http://0.0.0.0:8000/metrics $ curl http://0.0.0.0:8000/metrics
...@@ -33,7 +33,7 @@ Then query the endpoint to get the latest metrics from the server: ...@@ -33,7 +33,7 @@ Then query the endpoint to get the latest metrics from the server:
The following metrics are exposed: The following metrics are exposed:
??? Code ??? code
```python ```python
--8<-- "vllm/engine/metrics.py:metrics-definitions" --8<-- "vllm/engine/metrics.py:metrics-definitions"
......
--- # Troubleshooting
title: Troubleshooting
---
[](){ #troubleshooting }
This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
...@@ -60,7 +57,7 @@ To identify the particular CUDA operation that causes the error, you can add `-- ...@@ -60,7 +57,7 @@ To identify the particular CUDA operation that causes the error, you can add `--
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
??? Code ??? code
```python ```python
# Test PyTorch NCCL # Test PyTorch NCCL
...@@ -170,7 +167,7 @@ WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously ...@@ -170,7 +167,7 @@ WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
or an error from Python that looks like this: or an error from Python that looks like this:
??? Logs ??? console "Logs"
```console ```console
RuntimeError: RuntimeError:
...@@ -212,9 +209,9 @@ if __name__ == '__main__': ...@@ -212,9 +209,9 @@ if __name__ == '__main__':
## `torch.compile` Error ## `torch.compile` Error
vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](gh-pr:10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script:
??? Code ??? code
```python ```python
import torch import torch
...@@ -231,7 +228,7 @@ vLLM heavily depends on `torch.compile` to optimize the model for better perform ...@@ -231,7 +228,7 @@ vLLM heavily depends on `torch.compile` to optimize the model for better perform
print(f(x)) print(f(x))
``` ```
If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example. If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See <gh-issue:12219> for example.
## Model failed to be inspected ## Model failed to be inspected
...@@ -267,7 +264,7 @@ or: ...@@ -267,7 +264,7 @@ or:
ValueError: Model architectures ['<arch>'] are not supported for now. Supported architectures: [...] ValueError: Model architectures ['<arch>'] are not supported for now. Supported architectures: [...]
``` ```
But you are sure that the model is in the [list of supported models][supported-models], there may be some issue with vLLM's model resolution. In that case, please follow [these steps](../configuration/model_resolution.md) to explicitly specify the vLLM implementation for the model. But you are sure that the model is in the [list of supported models](../models/supported_models.md), there may be some issue with vLLM's model resolution. In that case, please follow [these steps](../configuration/model_resolution.md) to explicitly specify the vLLM implementation for the model.
## Failed to infer device type ## Failed to infer device type
......
...@@ -10,7 +10,7 @@ The list of data collected by the latest version of vLLM can be found here: <gh- ...@@ -10,7 +10,7 @@ The list of data collected by the latest version of vLLM can be found here: <gh-
Here is an example as of v0.4.0: Here is an example as of v0.4.0:
??? Output ??? console "Output"
```json ```json
{ {
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
!!! announcement !!! announcement
We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details. We have started the process of deprecating V0. Please read [RFC #18571](gh-issue:18571) for more details.
V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack). V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
...@@ -83,14 +83,14 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the ...@@ -83,14 +83,14 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
| **Decoder-only Models** | <nobr>🚀 Optimized</nobr> | | **Decoder-only Models** | <nobr>🚀 Optimized</nobr> |
| **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> | | **Encoder-Decoder Models** | <nobr>🟠 Delayed</nobr> |
| **Embedding Models** | <nobr>🟢 Functional</nobr> | | **Embedding Models** | <nobr>🟢 Functional</nobr> |
| **Mamba Models** | <nobr>🚧 WIP ([PR #19327](https://github.com/vllm-project/vllm/pull/19327))</nobr> | | **Mamba Models** | <nobr>🟢 (Mamba-2), 🟡 (Mamba-1)</nobr> |
| **Multimodal Models** | <nobr>🟢 Functional</nobr> | | **Multimodal Models** | <nobr>🟢 Functional</nobr> |
vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol. vLLM V1 currently excludes model architectures with the `SupportsV0Only` protocol.
!!! tip !!! tip
This corresponds to the V1 column in our [list of supported models][supported-models]. This corresponds to the V1 column in our [list of supported models](../models/supported_models.md).
See below for the status of models that are not yet supported or have more features planned in V1. See below for the status of models that are not yet supported or have more features planned in V1.
...@@ -98,14 +98,20 @@ See below for the status of models that are not yet supported or have more featu ...@@ -98,14 +98,20 @@ See below for the status of models that are not yet supported or have more featu
The initial basic support is now functional. The initial basic support is now functional.
Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), Later, we will consider using [hidden states processor](gh-issue:12249),
which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) which is based on [global logits processor](gh-pr:13360)
to enable simultaneous generation and embedding using the same engine instance in V1. to enable simultaneous generation and embedding using the same engine instance in V1.
#### Mamba Models #### Mamba Models
Models using selective state-space mechanisms instead of standard transformer attention (e.g., `MambaForCausalLM`, `JambaForCausalLM`) Models using selective state-space mechanisms instead of standard transformer attention are partially supported.
will be supported via [PR #19327](https://github.com/vllm-project/vllm/pull/19327). Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers
(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
disabling prefix caching in V1.
Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`). Please note that
these models currently require disabling prefix caching and using the FlashInfer attention backend in V1.
#### Encoder-Decoder Models #### Encoder-Decoder Models
...@@ -120,13 +126,13 @@ are not yet supported. ...@@ -120,13 +126,13 @@ are not yet supported.
| **Chunked Prefill** | <nobr>🚀 Optimized</nobr> | | **Chunked Prefill** | <nobr>🚀 Optimized</nobr> |
| **LoRA** | <nobr>🚀 Optimized</nobr> | | **LoRA** | <nobr>🚀 Optimized</nobr> |
| **Logprobs Calculation** | <nobr>🟢 Functional</nobr> | | **Logprobs Calculation** | <nobr>🟢 Functional</nobr> |
| **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))</nobr>| | **FP8 KV Cache** | <nobr>🟢 Functional on Hopper devices (<gh-pr:15191>)</nobr>|
| **Spec Decode** | <nobr>🚀 Optimized</nobr> | | **Spec Decode** | <nobr>🚀 Optimized</nobr> |
| **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))</nobr>| | **Prompt Logprobs with Prefix Caching** | <nobr>🟡 Planned ([RFC #13414](gh-issue:13414))</nobr>|
| **Structured Output Alternative Backends** | <nobr>🟢 Functional</nobr> | | **Structured Output Alternative Backends** | <nobr>🟢 Functional</nobr> |
| **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> | | **Request-level Structured Output Backend** | <nobr>🔴 Deprecated</nobr> |
| **best_of** | <nobr>🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))</nobr>| | **best_of** | <nobr>🔴 Deprecated ([RFC #13361](gh-issue:13361))</nobr>|
| **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360))</nobr> | | **Per-Request Logits Processors** | <nobr>🔴 Deprecated ([RFC #13360](gh-pr:13360))</nobr> |
| **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> | | **GPU <> CPU KV Cache Swapping** | <nobr>🔴 Deprecated</nobr> |
!!! note !!! note
...@@ -153,7 +159,7 @@ Support for logprobs with post-sampling adjustments is in progress and will be a ...@@ -153,7 +159,7 @@ Support for logprobs with post-sampling adjustments is in progress and will be a
**Prompt Logprobs with Prefix Caching** **Prompt Logprobs with Prefix Caching**
Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](https://github.com/vllm-project/vllm/issues/13414). Currently prompt logprobs are only supported when prefix caching is turned off via `--no-enable-prefix-caching`. In a future release, prompt logprobs will be compatible with prefix caching, but a recomputation will be triggered to recover the full prompt logprobs even upon a prefix cache hit. See details in [RFC #13414](gh-issue:13414).
#### Deprecated Features #### Deprecated Features
...@@ -161,11 +167,11 @@ As part of the major architectural rework in vLLM V1, several legacy features ha ...@@ -161,11 +167,11 @@ As part of the major architectural rework in vLLM V1, several legacy features ha
**Sampling features** **Sampling features**
- **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](https://github.com/vllm-project/vllm/issues/13361). - **best_of**: This feature has been deprecated due to limited usage. See details at [RFC #13361](gh-issue:13361).
- **Per-Request Logits Processors**: In V0, users could pass custom - **Per-Request Logits Processors**: In V0, users could pass custom
processing functions to adjust logits on a per-request basis. In vLLM V1, this processing functions to adjust logits on a per-request basis. In vLLM V1, this
feature has been deprecated. Instead, the design is moving toward supporting **global logits feature has been deprecated. Instead, the design is moving toward supporting **global logits
processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](https://github.com/vllm-project/vllm/pull/13360). processors**, a feature the team is actively working on for future releases. See details at [RFC #13360](gh-pr:13360).
**KV Cache features** **KV Cache features**
......
...@@ -10,7 +10,7 @@ on HuggingFace model repository. ...@@ -10,7 +10,7 @@ on HuggingFace model repository.
import os import os
from dataclasses import asdict from dataclasses import asdict
from typing import NamedTuple, Optional from typing import Any, NamedTuple, Optional
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoTokenizer from transformers import AutoTokenizer
...@@ -30,7 +30,9 @@ question_per_audio_count = { ...@@ -30,7 +30,9 @@ question_per_audio_count = {
class ModelRequestData(NamedTuple): class ModelRequestData(NamedTuple):
engine_args: EngineArgs engine_args: EngineArgs
prompt: str prompt: Optional[str] = None
prompt_token_ids: Optional[dict[str, list[int]]] = None
multi_modal_data: Optional[dict[str, Any]] = None
stop_token_ids: Optional[list[int]] = None stop_token_ids: Optional[list[int]] = None
lora_requests: Optional[list[LoRARequest]] = None lora_requests: Optional[list[LoRARequest]] = None
...@@ -40,6 +42,60 @@ class ModelRequestData(NamedTuple): ...@@ -40,6 +42,60 @@ class ModelRequestData(NamedTuple):
# Unless specified, these settings have been tested to work on a single L4. # Unless specified, these settings have been tested to work on a single L4.
# Voxtral
def run_voxtral(question: str, audio_count: int) -> ModelRequestData:
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (
AudioChunk,
RawAudio,
TextChunk,
UserMessage,
)
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
model_name = "mistralai/Voxtral-Mini-3B-2507"
tokenizer = MistralTokenizer.from_hf_hub(model_name)
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
config_format="mistral",
load_format="mistral",
tokenizer_mode="mistral",
enforce_eager=True,
enable_chunked_prefill=False,
)
text_chunk = TextChunk(text=question)
audios = [
Audio.from_file(str(audio_assets[i].get_local_path()), strict=False)
for i in range(audio_count)
]
audio_chunks = [
AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
]
messages = [UserMessage(content=[*audio_chunks, text_chunk])]
req = ChatCompletionRequest(messages=messages, model=model_name)
tokens = tokenizer.encode_chat_completion(req)
prompt_ids, audios = tokens.tokens, tokens.audios
audios_and_sr = [(au.audio_array, au.sampling_rate) for au in audios]
multi_modal_data = {"audio": audios_and_sr}
return ModelRequestData(
engine_args=engine_args,
prompt_token_ids=prompt_ids,
multi_modal_data=multi_modal_data,
)
# Granite Speech # Granite Speech
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# NOTE - the setting in this example are somehat different than what is # NOTE - the setting in this example are somehat different than what is
...@@ -243,6 +299,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData: ...@@ -243,6 +299,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
model_example_map = { model_example_map = {
"voxtral": run_voxtral,
"granite_speech": run_granite_speech, "granite_speech": run_granite_speech,
"minicpmo": run_minicpmo, "minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm, "phi4_mm": run_phi4mm,
...@@ -311,16 +368,24 @@ def main(args): ...@@ -311,16 +368,24 @@ def main(args):
temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
) )
mm_data = {} mm_data = req_data.multi_modal_data
if audio_count > 0: if not mm_data:
mm_data = { mm_data = {}
"audio": [ if audio_count > 0:
asset.audio_and_sample_rate for asset in audio_assets[:audio_count] mm_data = {
] "audio": [
} asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
]
}
assert args.num_prompts > 0 assert args.num_prompts > 0
inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data} inputs = {"multi_modal_data": mm_data}
if req_data.prompt:
inputs["prompt"] = req_data.prompt
else:
inputs["prompt_token_ids"] = req_data.prompt_token_ids
if args.num_prompts > 1: if args.num_prompts > 1:
# Batch inference # Batch inference
inputs = [inputs] * args.num_prompts inputs = [inputs] * args.num_prompts
......
...@@ -28,10 +28,10 @@ def main(args: Namespace): ...@@ -28,10 +28,10 @@ def main(args: Namespace):
# Create an LLM. # Create an LLM.
# You should pass task="classify" for classification models # You should pass task="classify" for classification models
model = LLM(**vars(args)) llm = LLM(**vars(args))
# Generate logits. The output is a list of ClassificationRequestOutputs. # Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts) outputs = llm.classify(prompts)
# Print the outputs. # Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60) print("\nGenerated Outputs:\n" + "-" * 60)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment