Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -640,6 +640,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
+  cache_ops.def(
+      "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
+  cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
  // Reshape the key and value tensors and cache them.
  cache_ops.def(
      "reshape_and_cache(Tensor key, Tensor value,"

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
+# SPDX-License-Identifier: Apache-2.0
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
@@ -35,7 +37,6 @@ author = 'the vLLM Team'
 # ones.
 extensions = [
    "sphinx.ext.napoleon",
-    "sphinx.ext.viewcode",
    "sphinx.ext.linkcode",
    "sphinx.ext.intersphinx",
    "sphinx_copybutton",

--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
@@ -250,7 +250,11 @@ def get_max_image_tokens(self) -> int:
 And thus, we can override the method as:
 ```python
-def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+def get_mm_max_tokens_per_item(
+    self,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+) -> Mapping[str, int]:
    return {"image": self.get_max_image_tokens()}
 ```

--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/source/features/quantization/auto_awq.md
@@ -2,12 +2,6 @@
 # AutoAWQ
-:::{warning}
-Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
-accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency
-inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version.
-:::
 To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
 Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
 The main benefits are lower latency and memory usage.

--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@@ -131,7 +131,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,
-    speculative_model="ibm-fms/llama3-70b-accelerator",
+    speculative_model="ibm-ai-platform/llama3-70b-accelerator",
    speculative_draft_tensor_parallel_size=1,
 )
 outputs = llm.generate(prompts, sampling_params)
@@ -149,11 +149,11 @@ limitation will be fixed in a future release.
 A variety of speculative models of this type are available on HF hub:
- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator)
+- [llama-13b-accelerator](https://huggingface.co/ibm-ai-platform/llama-13b-accelerator)
- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator)
+- [llama3-8b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-8b-accelerator)
- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator)
+- [codellama-34b-accelerator](https://huggingface.co/ibm-ai-platform/codellama-34b-accelerator)
- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator)
+- [llama2-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama2-70b-accelerator)
- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator)
+- [llama3-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-70b-accelerator)
 - [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator)
 - [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator)
 - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)

--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
+# SPDX-License-Identifier: Apache-2.0
 import itertools
 import re
 from dataclasses import dataclass, field

--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@@ -36,7 +36,7 @@ VLLM_TARGET_DEVICE=xpu python setup.py install
 :::{note}
 - FP16 is the default data type in the current XPU backend. The BF16 data
-  type will be supported in the future.
+  type is supported on Intel Data Center GPUs, but is not yet supported on Intel Arc GPUs.
 :::
 ## Set up using Docker

--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -40,6 +40,82 @@ If vLLM successfully returns text (for generative models) or hidden states (for
 Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
 Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
+### Transformers fallback
+After the merge of <gh-pr:11330>, `vllm` can fall back to models that are available in `transformers`. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned!
+To check if the backend is `transformers`, you can simply do this:
+```python 
+from vllm import LLM
+llm = LLM(model=..., task="generate")  # Name or path of your model
+llm.apply_model(lambda model: print(model.__class__))
+```
+If it is `TransformersModel` then it means it's based on `transformers`!
+#### Supported features
+##### LORA and quantization
+Neither is supported yet! Make sure to open an issue and we'll work on this together with the `transformers` team!
+Usually, `transformers` models load adapter weights via the `load_adapter` API, which depends on PEFT. We need to work a bit to either use this API (for now this would result in some weights not being marked as loaded) or replace modules accordingly.
+Here is a hint as to what this would look like:
+```python
+class TransformersModel(nn.Module, SupportsLoRA):
+  def __init__(*):
+    ...
+    self.model.load_adapter(vllm_config.load_config.model_loader_extra_config["qlora_adapter_name_or_path"])
+```
+The blocker is that you need to specify the supported LoRA layers, whereas ideally we would want to load whatever is inside the checkpoint!
+##### Remote code
+This fallback also means that any model on the hub that can be used in `transformers` with `trust_remote_code=True` that correctly implements attention can be used in production!
+```python 
+from vllm import LLM
+llm = LLM(model=..., task="generate", trust_remote_code=True)  # Name or path of your model
+llm.apply_model(lambda model: print(model.__class__))
+```
+A model just needs the following two things:
+```python
+from transformers import PreTrainedModel
+from torch import nn
+class MyAttention(nn.Module):
+  def forward(self, hidden_states, **kwargs): # <- kwargs are required
+    ...
+    attention_interface = attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+    attn_output, attn_weights = attention_interface(
+      self,
+      query_states,
+      key_states,
+      value_states,
+      **kwargs,
+    )
+    ...
+class MyModel(PreTrainedModel):
+  _supports_attention_backend = True
+```
+Here is what happens in the background:
+1. The config is loaded
+2. `MyModel` python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
+3. The `TransformersModel` backend is used. See `/model_executors/models/transformers`, which leverages `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTIONS`.
+That's it!
 ### ModelScope
 To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
@@ -650,14 +726,14 @@ See [this page](#generative-models) for more information on how to use generativ
  * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
  *
  * ✅︎
-  *
+  * \*
 - * `Idefics3ForConditionalGeneration`
  * Idefics3
  * T + I
  * `HuggingFaceM4/Idefics3-8B-Llama3` etc.
  * ✅︎
  *
-  *
+  * ✅︎
 - * `InternVLChatModel`
  * InternVL 2.5, Mono-InternVL, InternVL 2.0
  * T + I<sup>E+</sup>
@@ -723,7 +799,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * ✅︎
 - * `NVLM_D_Model`
  * NVLM-D 1.0
-  * T + I<sup>E+</sup>
+  * T + I<sup>+</sup>
  * `nvidia/NVLM-D-72B`, etc.
  *
  * ✅︎
@@ -770,11 +846,18 @@ See [this page](#generative-models) for more information on how to use generativ
  * ✅︎
  * ✅︎
  * ✅︎
+- * `Qwen2_5_VLForConditionalGeneration`
+  * Qwen2.5-VL
+  * T + I<sup>E+</sup> + V<sup>E+</sup>
+  * `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc.
+  *
+  * ✅︎
+  * ✅︎
 - * `UltravoxModel`
  * Ultravox
  * T + A<sup>E+</sup>
  * `fixie-ai/ultravox-v0_3`
-  *
+  * ✅︎
  * ✅︎
  * ✅︎
 :::
@@ -783,7 +866,11 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 :::{note}
-To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+:::
+:::{note}
+H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
 :::
 :::{note}
@@ -796,8 +883,11 @@ For more details, please see: <gh-pr:4087#issuecomment-2250397630>
 :::
 :::{note}
-The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)).
+`mistral-community/pixtral-12b` does not support V1 yet.
-A corrected version is available at <gh-file:examples/template_pixtral_hf.jinja>.
+:::
+:::{note}
+To use Qwen2.5-VL series models, you have to install Huggingface `transformers` library from source via `pip install git+https://github.com/huggingface/transformers`.
 :::
 ### Pooling Models

--- a/docs/source/serving/distributed_serving.md
+++ b/docs/source/serving/distributed_serving.md
@@ -60,7 +60,8 @@ bash run_cluster.sh \
                vllm/vllm-openai \
                ip_of_head_node \
                --head \
-                /path/to/the/huggingface/home/in/this/node
+                /path/to/the/huggingface/home/in/this/node \
+                -e VLLM_HOST_IP=ip_of_this_node
 ```
 On the rest of the worker nodes, run the following command:
@@ -70,10 +71,11 @@ bash run_cluster.sh \
                vllm/vllm-openai \
                ip_of_head_node \
                --worker \
-                /path/to/the/huggingface/home/in/this/node
+                /path/to/the/huggingface/home/in/this/node \
+                -e VLLM_HOST_IP=ip_of_this_node
 ```
-Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.
+Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP address of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses.
 Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
@@ -103,3 +105,7 @@ Please make sure you downloaded the model to all the nodes (with the same path),
 When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model.
 :::
+:::{warning}
+If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` to see the IP address used by Ray. See <gh-issue:7815> for more information.
+:::
--- a/examples/offline_inference/aqlm_example.py
+++ b/examples/offline_inference/aqlm_example.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM, SamplingParams
 from vllm.utils import FlexibleArgumentParser

--- a/examples/offline_inference/arctic.py
+++ b/examples/offline_inference/arctic.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM, SamplingParams
 # Sample prompts.

--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use vLLM for running offline inference 
 with the correct prompt format on audio language models.

--- a/examples/offline_inference/basic.py
+++ b/examples/offline_inference/basic.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM, SamplingParams
 if __name__ == '__main__':

--- a/examples/offline_inference/basic_with_model_default_sampling.py
+++ b/examples/offline_inference/basic_with_model_default_sampling.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM
 # Sample prompts.

--- a/examples/offline_inference/chat.py
+++ b/examples/offline_inference/chat.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM, SamplingParams
 llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
+# SPDX-License-Identifier: Apache-2.0
 # ruff: noqa
 import json
 import random

--- a/examples/offline_inference/classification.py
+++ b/examples/offline_inference/classification.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM
 # Sample prompts.

--- a/examples/offline_inference/cli.py
+++ b/examples/offline_inference/cli.py
+# SPDX-License-Identifier: Apache-2.0
 from dataclasses import asdict
 from vllm import LLM, SamplingParams

--- a/examples/offline_inference/cpu_offload.py
+++ b/examples/offline_inference/cpu_offload.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM, SamplingParams
 # Sample prompts.

--- a/examples/offline_inference/distributed.py
+++ b/examples/offline_inference/distributed.py
+# SPDX-License-Identifier: Apache-2.0
 """
 This example shows how to use Ray Data for running offline batch inference
 distributively on a multi-nodes cluster.