Merge tag 'v0.15.1' into v0.15.1-dev

45a060d6 · zhuwenwen · 99fc9fc3 · 1892993b · 45a060d6 · 45a060d6
Commit 45a060d6 authored Feb 05, 2026 by zhuwenwen
20 changed files
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -274,14 +274,14 @@ steps:
          - input-release-version
          - build-wheels
-      - label: "Upload release wheels to PyPI and GitHub"
+      - label: "Upload release wheels to PyPI"
        depends_on:
          - block-upload-release-wheels
        id: upload-release-wheels
        agents:
          queue: small_cpu_queue_postmerge
        commands:
-          - "bash .buildkite/scripts/upload-release-wheels.sh"
+          - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
  # =============================================================================
  # ROCm Release Pipeline (x86_64 only)

--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -11,58 +11,80 @@ fi
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel (by commit):
 \`\`\`
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+(Optional) For CUDA 13.0:
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
-\`\`\`
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
-To download the wheel (by version):
+(Optional) For CPU:
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
 \`\`\`
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
-\`\`\`
 To download and upload the image:
 \`\`\`
+Download images:
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
+Tag and push images:
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
 docker push vllm/vllm-openai:latest-x86_64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+docker push vllm/vllm-openai:latest-x86_64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker push vllm/vllm-openai:latest-aarch64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+Create multi-arch manifest:
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
 docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
 docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
 docker push vllm/vllm-openai-rocm:latest-base
 docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
+docker manifest rm vllm/vllm-openai:latest-cu130
+docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker manifest push vllm/vllm-openai:latest-cu130
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
 \`\`\`
 EOF 
--- a/.buildkite/scripts/upload-release-wheels.sh
+++ b/.buildkite/scripts/upload-release-wheels.sh
@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
 S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
 RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
-echo "Release version from Buildkite: $RELEASE_VERSION"
 GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
-if [ -z "$GIT_VERSION" ]; then
+echo "Release version from Buildkite: $RELEASE_VERSION"
+if [[ -z "$GIT_VERSION" ]]; then
    echo "[FATAL] Not on a git tag, cannot create release."
    exit 1
 else
    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
 fi
 # sanity check for version mismatch
-if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
+if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
-  if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
+  if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
    echo "[WARNING] Force release and ignore version mismatch"
  else
    echo "[FATAL] Release version from Buildkite does not match Git version."
@@ -27,7 +29,7 @@ fi
 PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
 # check pypi token
-if [ -z "$PYPI_TOKEN" ]; then
+if [[ -z "$PYPI_TOKEN" ]]; then
  echo "[FATAL] PYPI_TOKEN is not set."
  exit 1
 else
@@ -35,41 +37,8 @@ else
  export TWINE_PASSWORD="$PYPI_TOKEN"
 fi
-# check github token
-if [ -z "$GITHUB_TOKEN" ]; then
-  echo "[FATAL] GITHUB_TOKEN is not set."
-  exit 1
-else
-  export GH_TOKEN="$GITHUB_TOKEN"
-fi
 set -x # avoid printing secrets above
-# download gh CLI from github
-# Get latest gh CLI version from GitHub API
-GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
-if [ -z "$GH_VERSION" ]; then
-  echo "[FATAL] Failed to get latest gh CLI version from GitHub"
-  exit 1
-fi
-echo "Downloading gh CLI version: $GH_VERSION"
-GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
-GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
-GH_INSTALL_DIR="/tmp/gh-install"
-mkdir -p "$GH_INSTALL_DIR"
-pushd "$GH_INSTALL_DIR"
-curl -L -o "$GH_TARBALL" "$GH_URL"
-tar -xzf "$GH_TARBALL"
-GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
-if [ -z "$GH_BIN" ]; then
-  echo "[FATAL] Failed to find gh CLI executable"
-  exit 1
-fi
-echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
-echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
-command "$GH_BIN" release list --limit 5
-popd
 # install twine from pypi
 python3 -m venv /tmp/vllm-release-env
 source /tmp/vllm-release-env/bin/activate
@@ -89,16 +58,13 @@ echo "Wheels copied to local directory"
 git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
 ls -la $DIST_DIR
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
 PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
-if [ -z "$PYPI_WHEEL_FILES" ]; then
+if [[ -z "$PYPI_WHEEL_FILES" ]]; then
  echo "No default variant wheels found, quitting..."
  exit 1
 fi
 python3 -m twine check $PYPI_WHEEL_FILES
-python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
+python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
 echo "Wheels uploaded to PyPI"
-# create release on GitHub with the release version and all wheels
-command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
--- a/README.md
+++ b/README.md
@@ -2,10 +2,6 @@
 ## 简介
 vLLM是一个快速且易于使用的LLM推理和服务库,使用PagedAttention高效管理kv内存,Continuous batching传入请求,支持很多Hugging Face模型,如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。
-## 暂不支持的官方功能
- **量化推理**:目前不支持marlin的权重量化
- **模块支持**:目前不支持Sliding window attention
 ## 支持模型结构列表
@@ -51,29 +47,23 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
 | XLMRobertaForSequenceClassification | bge-reranker-v2-m3                            | Yes    | No  | -   | v0.7.2       | No  |
-## 安装
-vLLM支持
-+ Python 3.9.
-+ Python 3.10.
-+ Python 3.11.
-+ Python 3.12.
-### 使用源码编译方式安装
+## 使用源码编译方式安装
-#### 编译环境准备
+### 编译环境准备
 提供2种环境准备方式:
-1. 基于光源pytorch2.9.01基础镜像环境:根据pytorch2.9.0、python、dtk及系统下载对应的镜像版本。
+1. 基于光源pytorch2.9.0基础镜像环境:根据pytorch2.9.0、python、dtk及系统下载对应的镜像版本。
-2. 基于现有python环境:安装pytorch2.9.01,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.5.1的whl包。安装命令如下:
+2. 基于现有python环境:安装pytorch2.9.0,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch),根据python、dtk版本,下载对应pytorch2.9.0的whl包。安装命令如下:
 ```shell
 pip install torch* (下载的torch的whl包)
 pip install setuptools wheel
 ```
-#### 源码编译安装
+### 源码编译安装
 ```shell
-git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的分支进行切换
+git clone http://10.16.6.30/dcutoolkit/deeplearing/vllm.git # 根据需要的分支进行切换
 ```
 安装依赖:
 ```shell
@@ -91,10 +81,8 @@ python3 setup.py install （若调试，可使用python3 setup.py develop）
 ```
 若需要添加git号，设置环境变量: export ADD_GIT_VERSION=1
-3.跳过编译（适用于未改变csrc目录kernel并多次编译情况）
-将编译后的so文件拷贝至csrc目录，并设置环境变量: export SKIP_VLLM_BUILD=1
-#### 运行基础环境准备
+### 运行基础环境准备
 1、使用上面基于光源pytorch2.9.0基础镜像环境
 2、根据pytorch2.9.0、python、dtk及系统下载对应的依赖包:
@@ -104,11 +92,11 @@ python3 setup.py install （若调试，可使用python3 setup.py develop）
 - lightop: [https://cancon.hpccube.com:65024/4/main/lightop](https://cancon.hpccube.com:65024/4/main/lightop)
 - lmslim: [https://cancon.hpccube.com:65024/4/main/lmslim](https://cancon.hpccube.com:65024/4/main/lmslim)
-#### 注意事项
+### 注意事项
 + 若使用 pip install 下载安装过慢,可添加源: -i https://pypi.tuna.tsinghua.edu.cn/simple/
 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.15.0;
+- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.15.1;
 ## Known Issue
 - 无

--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -456,6 +456,7 @@ th {
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
 | `Step1ForCausalLM` | Step-Audio | `stepfun-ai/Step-Audio-EditX`, etc. | ✅︎ | ✅︎ |
+| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/step-3.5-flash`, etc. |  | ✅︎ |
 | `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
 | `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
 | `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |

--- a/examples/pooling/score/vision_rerank_api_online.py
+++ b/examples/pooling/score/vision_rerank_api_online.py
@@ -18,48 +18,32 @@ e.g.
 """
 import argparse
-import base64
+import pprint
-import json
 import requests
+from vllm.multimodal.utils import encode_image_url, fetch_image
-def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
-    """Encode a content retrieved from a remote url to base64 format."""
-    with requests.get(content_url, headers=headers) as response:
-        response.raise_for_status()
-        result = base64.b64encode(response.content).decode("utf-8")
-    return {"url": f"data:image/jpeg;base64,{result}"}
-headers = {"accept": "application/json", "Content-Type": "application/json"}
 query = "A woman playing with her dog on a beach at sunset."
-documents = {
+document = (
-    "content": [
+    "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
-        {
+    "as the dog offers its paw in a heartwarming display of companionship and trust."
-            "type": "text",
+)
-            "text": (
+image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-                "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
+documents = [
-                "as the dog offers its paw in a heartwarming display of companionship and trust."
+    {
-            ),
+        "type": "text",
-        },
+        "text": document,
-        {
+    },
-            "type": "image_url",
+    {
-            "image_url": {
+        "type": "image_url",
-                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+        "image_url": {"url": image_url},
-            },
+    },
-        },
+    {
-        {
+        "type": "image_url",
-            "type": "image_url",
+        "image_url": {"url": encode_image_url(fetch_image(image_url))},
-            "image_url": encode_base64_content_from_url(
+    },
-                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+]
-            ),
-        },
-    ]
-}
 def parse_args():
@@ -74,23 +58,36 @@ def main(args):
    models_url = base_url + "/v1/models"
    rerank_url = base_url + "/rerank"
-    response = requests.get(models_url, headers=headers)
+    response = requests.get(models_url)
    model = response.json()["data"][0]["id"]
-    data = {
+    print("Query: string & Document: list of string")
+    prompt = {"model": model, "query": query, "documents": [document]}
+    response = requests.post(rerank_url, json=prompt)
+    pprint.pprint(response.json())
+    print("Query: string & Document: text")
+    prompt = {"model": model, "query": query, "documents": {"content": [documents[0]]}}
+    response = requests.post(rerank_url, json=prompt)
+    pprint.pprint(response.json())
+    print("Query: string & Document: image url")
+    prompt = {
+        "model": model,
+        "query": query,
+        "documents": {"content": [documents[1]]},
+    }
+    response = requests.post(rerank_url, json=prompt)
+    pprint.pprint(response.json())
+    print("Query: string & Document: image base64")
+    prompt = {
        "model": model,
        "query": query,
-        "documents": documents,
+        "documents": {"content": [documents[2]]},
    }
-    response = requests.post(rerank_url, headers=headers, json=data)
+    response = requests.post(rerank_url, json=prompt)
+    pprint.pprint(response.json())
-    # Check the response
-    if response.status_code == 200:
-        print("Request successful!")
-        print(json.dumps(response.json(), indent=2))
-    else:
-        print(f"Request failed with status code: {response.status_code}")
-        print(response.text)
 if __name__ == "__main__":

--- a/examples/pooling/score/vision_score_api_online.py
+++ b/examples/pooling/score/vision_score_api_online.py
@@ -17,48 +17,32 @@ e.g.
 """
 import argparse
-import base64
-import json
 import pprint
 import requests
+from vllm.multimodal.utils import encode_image_url, fetch_image
-def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
-    """Encode a content retrieved from a remote url to base64 format."""
+query = "A woman playing with her dog on a beach at sunset."
+document = (
-    with requests.get(content_url, headers=headers) as response:
+    "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
-        response.raise_for_status()
+    "as the dog offers its paw in a heartwarming display of companionship and trust."
-        result = base64.b64encode(response.content).decode("utf-8")
+)
+image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-    return {"url": f"data:image/jpeg;base64,{result}"}
+documents = [
+    {
+        "type": "text",
-headers = {"accept": "application/json", "Content-Type": "application/json"}
+        "text": document,
+    },
-queries = "slm markdown"
+    {
-documents = {
+        "type": "image_url",
-    "content": [
+        "image_url": {"url": image_url},
-        {
+    },
-            "type": "image_url",
+    {
-            "image_url": {
+        "type": "image_url",
-                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+        "image_url": {"url": encode_image_url(fetch_image(image_url))},
-            },
+    },
-        },
+]
-        {
-            "type": "image_url",
-            "image_url": {
-                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-            },
-        },
-        {
-            "type": "image_url",
-            "image_url": encode_base64_content_from_url(
-                "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-            ),
-        },
-    ]
-}
 def parse_args():
@@ -73,15 +57,40 @@ def main(args):
    models_url = base_url + "/v1/models"
    score_url = base_url + "/score"
-    response = requests.get(models_url, headers=headers)
+    response = requests.get(models_url)
    model = response.json()["data"][0]["id"]
-    prompt = {"model": model, "queries": queries, "documents": documents}
+    print("Query: string & Document: string")
-    response = requests.post(score_url, headers=headers, json=prompt)
+    prompt = {"model": model, "queries": query, "documents": document}
-    print("\nPrompt when queries is string and documents is a image list:")
+    response = requests.post(score_url, json=prompt)
-    pprint.pprint(prompt)
+    pprint.pprint(response.json())
-    print("\nScore Response:")
-    print(json.dumps(response.json(), indent=2))
+    print("Query: string & Document: text")
+    prompt = {
+        "model": model,
+        "queries": query,
+        "documents": {"content": [documents[0]]},
+    }
+    response = requests.post(score_url, json=prompt)
+    pprint.pprint(response.json())
+    print("Query: string & Document: image url")
+    prompt = {
+        "model": model,
+        "queries": query,
+        "documents": {"content": [documents[1]]},
+    }
+    response = requests.post(score_url, json=prompt)
+    pprint.pprint(response.json())
+    print("Query: string & Document: image base64")
+    prompt = {
+        "model": model,
+        "queries": query,
+        "documents": {"content": [documents[2]]},
+    }
+    response = requests.post(score_url, json=prompt)
+    pprint.pprint(response.json())
 if __name__ == "__main__":

--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -9,5 +9,5 @@ wheel
 jinja2>=3.1.6
 regex
 build
-protobuf
+protobuf >= 6.33.5
 grpcio-tools
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -9,9 +9,9 @@ blake3
 py-cpuinfo
 transformers >= 4.56.0, < 5
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
-protobuf # Required by LlamaTokenizer, gRPC.
+protobuf >= 6.33.5 # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
-aiohttp
+aiohttp >= 3.13.3
 openai >= 1.99.1  # For Responses API with reasoning content
 pydantic >= 2.12.0
 prometheus_client >= 0.18.0

--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
-lmcache
+lmcache >= 0.3.9
 nixl >= 0.7.1 # Required for disaggregated prefill
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -14,7 +14,7 @@ pytest-shard==0.1.2
 # Async/HTTP dependencies
 anyio==4.6.2.post1
    # via httpx, starlette
-aiohttp==3.13.0
+aiohttp==3.13.3
    # via gpt-oss
 httpx==0.27.2
    # HTTP testing

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -12,7 +12,7 @@ affine==2.4.0
    # via rasterio
 aiohappyeyeballs==2.6.1
    # via aiohttp
-aiohttp==3.13.0
+aiohttp==3.13.3
    # via
    #   aiohttp-cors
    #   datasets

--- a/setup.py
+++ b/setup.py
@@ -949,9 +949,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    new_version_content = f"""
 try:
-    __version__ = "0.15.0"
+    __version__ = "0.15.1"
-    __version_tuple__ = (0, 15, 0)
+    __version_tuple__ = (0, 15, 1)
-    __hcu_version__ = f'0.15.0+{version}' 
+    __hcu_version__ = f'0.15.1+{version}' 
    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:

--- a/tests/compile/test_cold_start.py
+++ b/tests/compile/test_cold_start.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from torch._dynamo.utils import counters
+from vllm import LLM
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
+    # Run in same process so we can access PyTorch's internal counters
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    # Set VLLM_USE_AOT_COMPILE=0; it is unclear whether AOT compile would affect the counter values asserted below
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "0")
+    # Force cold compilation
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+    compilation_config = CompilationConfig(
+        mode=CompilationMode.VLLM_COMPILE,
+        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
+    )
+    counters.clear()
+    _ = LLM(
+        model="microsoft/Phi-tiny-MoE-instruct",
+        max_model_len=256,
+        load_format="dummy",  # make the model loading faster
+        compilation_config=compilation_config,
+        num_gpu_blocks_override=8,  # make the model loading faster
+    )
+    # vLLM-compile cold start is special. By default, we do
+    # one full dynamo capture of the entire forward pass.
+    # The forward pass consists of 32 transformer layers.
+    # Then, we split on the attention operation. This results in
+    # 33 subgraphs (not including the attention operation).
+    # The 33 subgraphs then get standalone_compile'd.
+    #
+    # There are actually only 3 unique subgraphs for this model
+    # (all of its transformer layers are the same modulo weights);
+    # this is true for most vLLM models.
+    # So we test that during cold start, the aot_autograd cache
+    # misses for 3 subgraphs and hits for the rest.
+    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
+    assert counters["aot_autograd"]["autograd_cache_hit"] == 30
--- a/tests/compile/test_graph_partition.py
+++ b/tests/compile/test_graph_partition.py
@@ -8,6 +8,10 @@ import torch
 from torch.fx.experimental.proxy_tensor import make_fx
 from vllm.compilation.backends import split_graph
+from vllm.compilation.fx_utils import find_op_nodes
+# This import automatically registers `torch.ops.silly.attention`
+from . import silly_attention  # noqa: F401
 def test_getitem_moved_to_producer_subgraph():
@@ -122,3 +126,61 @@ def test_no_tuple_inputs_with_multiple_consumers():
    output_split = split_gm(new_x)
    assert torch.allclose(output_original, output_split), "Output mismatch after split"
+def test_consecutive_ops_in_split():
+    """
+    Test that consecutive splitting operations are grouped into the same subgraph
+    """
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        """
+        Define a simple model where consecutive operations create opportunities
+        for splitting subgraphs.
+        """
+        # Apply relu and sqrt, then silly attention, then sigmoid
+        intermediate = torch.relu(x)
+        attn_inout = torch.sqrt(intermediate)
+        torch.ops.silly.attention(intermediate, intermediate, attn_inout, attn_inout)
+        final_result = torch.sigmoid(attn_inout)
+        return final_result
+    torch.set_default_device("cuda")
+    # Create the traced FX graph for the model
+    x = torch.randn(8, 4)
+    gm = make_fx(model_fn)(x)
+    # Assert presence of the expected operations in the setup
+    assert (
+        len(list(find_op_nodes(torch.ops.aten.relu, gm.graph))) == 1
+        and len(list(find_op_nodes(torch.ops.aten.sqrt, gm.graph))) == 1
+    ), "Test setup failed: Expected sqrt and relu operations in the graph."
+    # Configure split operations to test
+    splitting_ops = ["silly::attention", "aten::sqrt"]
+    split_gm, split_items = split_graph(gm, splitting_ops)
+    # Validate the number of partitions
+    assert len(split_items) == 3, (
+        "Consecutive splitting operations were not grouped correctly."
+    )
+    # Validate that correctness is preserved
+    new_x = torch.randn(8, 4)
+    output_original = gm(new_x)
+    output_split = split_gm(new_x)
+    assert torch.allclose(output_original, output_split), (
+        "Output mismatch after splitting."
+    )
+    # Check the splitting graph has exactly 2 call_function nodes (sqrt and attn)
+    splitting_items = list(s for s in split_items if s.is_splitting_graph)
+    assert len(splitting_items) == 1, "Expecting a single splitting graph"
+    print(splitting_items[0].graph.graph)
+    splitting_gm = splitting_items[0].graph
+    assert len(splitting_gm.graph.nodes) == 4, "Expecting 4 nodes in splitting graph"
+    assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
+        "call_function"
+    ] + ["output"]
--- a/tests/entrypoints/pooling/classify/test_online_vision.py
+++ b/tests/entrypoints/pooling/classify/test_online_vision.py
@@ -5,9 +5,9 @@ import json
 import pytest
 import requests
-from tests.entrypoints.test_utils import encode_base64_content_from_url
 from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
+from vllm.multimodal.utils import encode_image_url, fetch_image
 MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
 MAXIMUM_VIDEOS = 1
@@ -19,7 +19,7 @@ HF_OVERRIDES = {
 }
 input_text = "This product was excellent and exceeded my expectations"
 image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
-image_base64 = encode_base64_content_from_url(image_url)
+image_base64 = {"url": encode_image_url(fetch_image(image_url))}
 video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"

--- a/tests/entrypoints/pooling/score/test_online_score_vision.py
+++ b/tests/entrypoints/pooling/score/test_online_score_vision.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import requests
+from tests.utils import VLLM_PATH, RemoteOpenAIServer
+from vllm.entrypoints.pooling.score.protocol import ScoreResponse
+from vllm.multimodal.utils import encode_image_url, fetch_image
+MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
+HF_OVERRIDES = {
+    "architectures": ["Qwen3VLForSequenceClassification"],
+    "classifier_from_token": ["no", "yes"],
+    "is_original_qwen3_reranker": True,
+}
+query = "A cat standing in the snow."
+image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
+documents = [
+    {
+        "type": "text",
+        "text": query,
+    },
+    {
+        "type": "image_url",
+        "image_url": {"url": image_url},
+    },
+    {
+        "type": "image_url",
+        "image_url": {"url": encode_image_url(fetch_image(image_url))},
+    },
+]
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--enforce-eager",
+        "--max-model-len",
+        "8192",
+        "--chat-template",
+        str(VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"),
+    ]
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
+    ) as remote_server:
+        yield remote_server
+def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
+    queries = "What is the capital of France?"
+    documents = "The capital of France is Paris."
+    score_response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": MODEL_NAME,
+            "queries": queries,
+            "documents": documents,
+        },
+    )
+    score_response.raise_for_status()
+    score = ScoreResponse.model_validate(score_response.json())
+    assert score.id is not None
+    assert score.data is not None
+    assert len(score.data) == 1
+def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
+    score_response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": MODEL_NAME,
+            "queries": query,
+            "documents": {"content": [documents[0]]},
+        },
+    )
+    score_response.raise_for_status()
+    score = ScoreResponse.model_validate(score_response.json())
+    assert score.id is not None
+    assert score.data is not None
+    assert len(score.data) == 1
+def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer):
+    score_response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": MODEL_NAME,
+            "queries": query,
+            "documents": {"content": [documents[1]]},
+        },
+    )
+    score_response.raise_for_status()
+    score = ScoreResponse.model_validate(score_response.json())
+    assert score.id is not None
+    assert score.data is not None
+    assert len(score.data) == 1
+def test_score_api_queries_str_documents_image_base64_content(
+    server: RemoteOpenAIServer,
+):
+    score_response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": MODEL_NAME,
+            "queries": query,
+            "documents": {"content": [documents[2]]},
+        },
+    )
+    score_response.raise_for_status()
+    score = ScoreResponse.model_validate(score_response.json())
+    assert score.id is not None
+    assert score.data is not None
+    assert len(score.data) == 1
--- a/tests/entrypoints/test_utils.py
+++ b/tests/entrypoints/test_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
-import requests
 from vllm.entrypoints.utils import sanitize_message
@@ -12,11 +8,3 @@ def test_sanitize_message():
        sanitize_message("<_io.BytesIO object at 0x7a95e299e750>")
        == "<_io.BytesIO object>"
    )
-def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
-    with requests.get(content_url) as response:
-        response.raise_for_status()
-        result = base64.b64encode(response.content).decode("utf-8")
-    return {"url": f"data:image/jpeg;base64,{result}"}
--- a/tests/kernels/core/test_activation.py
+++ b/tests/kernels/core/test_activation.py
@@ -17,6 +17,8 @@ from vllm.model_executor.layers.activation import (
    QuickGELU,
    SiluAndMul,
    SwigluOAIAndMul,
+    SwigluStepAndMul,
+    swiglustep_and_mul_triton,
 )
 from vllm.utils.torch_utils import set_random_seed
@@ -36,6 +38,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
        "gelu_tanh",
        "fatrelu",
        "swigluoai_and_mul",
+        "swiglustep_and_mul",
    ],
 )
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -75,9 +78,12 @@ def test_act_and_mul(
    elif activation == "swigluoai_and_mul":
        layer = SwigluOAIAndMul()
        fn = torch.ops._C.swigluoai_and_mul
+    elif activation == "swiglustep_and_mul":
+        layer = SwigluStepAndMul()
+        fn = swiglustep_and_mul_triton
    out = layer(x)
    ref_out = layer.forward_native(x)
-    if activation == "swigluoai_and_mul":
+    if activation in ["swigluoai_and_mul", "swiglustep_and_mul"]:
        rtol = {
            # For fp16, change the relative tolerance from 1e-3 to 2e-3
            torch.float16: 2e-3,
@@ -104,7 +110,7 @@ def test_act_and_mul(
        opcheck(fn, (out, x, threshold))
    elif activation == "swigluoai_and_mul":
        opcheck(fn, (out, x, layer.alpha, layer.limit))
-    else:
+    elif activation != "swiglustep_and_mul":
        opcheck(fn, (out, x))

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -722,7 +722,7 @@ def test_mixtral_moe(
        # need to override the forward context for unittests, otherwise it assumes
        # we're running the model forward pass (the model specified in vllm_config)
-        get_forward_context().remaining_moe_layers = None
+        get_forward_context().all_moe_layers = None
        # Run forward passes for both MoE blocks
        hf_states, _ = hf_moe.forward(hf_inputs)