Merge tag 'v0.15.1' into v0.15.1-ori

fc7980db · zhuwenwen · 3eab7fef · 1892993b · fc7980db · fc7980db
Commit fc7980db authored Feb 05, 2026 by zhuwenwen
20 changed files
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -274,14 +274,14 @@ steps:
          - input-release-version
          - build-wheels

-      - label: "Upload release wheels to PyPI and GitHub"
+      - label: "Upload release wheels to PyPI"
        depends_on:
          - block-upload-release-wheels
        id: upload-release-wheels
        agents:
          queue: small_cpu_queue_postmerge
        commands:
-          - "bash .buildkite/scripts/upload-release-wheels.sh"
+          - "bash .buildkite/scripts/upload-release-wheels-pypi.sh"

  # =============================================================================
  # ROCm Release Pipeline (x86_64 only)

--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -11,58 +11,80 @@ fi
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel (by commit):
 \`\`\`
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-\`\`\`
+(Optional) For CUDA 13.0:
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .

-To download the wheel (by version):
+(Optional) For CPU:
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
 \`\`\`
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
-\`\`\`

 To download and upload the image:

 \`\`\`
+Download images:
+
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm

+Tag and push images:
+
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
 docker push vllm/vllm-openai:latest-x86_64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64

+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
+docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+docker push vllm/vllm-openai:latest-x86_64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
+
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker push vllm/vllm-openai:latest-aarch64-cu130
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
+docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+docker push vllm/vllm-openai-rocm:latest
+docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
+
+Create multi-arch manifest:
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
 docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
 docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
 docker push vllm/vllm-openai-rocm:latest-base
 docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base

-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
-docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-docker push vllm/vllm-openai-rocm:latest
-docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
-
 docker manifest rm vllm/vllm-openai:latest
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
+
+docker manifest rm vllm/vllm-openai:latest-cu130
+docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
+docker manifest push vllm/vllm-openai:latest-cu130
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
 \`\`\`
 EOF 
--- a/.buildkite/scripts/upload-release-wheels.sh
+++ b/.buildkite/scripts/upload-release-wheels.sh
@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
 S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"

 RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
-echo "Release version from Buildkite: $RELEASE_VERSION"
 GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
-if [ -z "$GIT_VERSION" ]; then
+
+echo "Release version from Buildkite: $RELEASE_VERSION"
+
+if [[ -z "$GIT_VERSION" ]]; then
    echo "[FATAL] Not on a git tag, cannot create release."
    exit 1
 else
    echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
 fi
 # sanity check for version mismatch
-if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
-  if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
+if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
+  if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
    echo "[WARNING] Force release and ignore version mismatch"
  else
    echo "[FATAL] Release version from Buildkite does not match Git version."
@@ -27,7 +29,7 @@ fi
 PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'

 # check pypi token
-if [ -z "$PYPI_TOKEN" ]; then
+if [[ -z "$PYPI_TOKEN" ]]; then
  echo "[FATAL] PYPI_TOKEN is not set."
  exit 1
 else
@@ -35,41 +37,8 @@ else
  export TWINE_PASSWORD="$PYPI_TOKEN"
 fi

-# check github token
-if [ -z "$GITHUB_TOKEN" ]; then
-  echo "[FATAL] GITHUB_TOKEN is not set."
-  exit 1
-else
-  export GH_TOKEN="$GITHUB_TOKEN"
-fi
-
 set -x # avoid printing secrets above

-# download gh CLI from github
-# Get latest gh CLI version from GitHub API
-GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
-if [ -z "$GH_VERSION" ]; then
-  echo "[FATAL] Failed to get latest gh CLI version from GitHub"
-  exit 1
-fi
-echo "Downloading gh CLI version: $GH_VERSION"
-GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
-GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
-GH_INSTALL_DIR="/tmp/gh-install"
-mkdir -p "$GH_INSTALL_DIR"
-pushd "$GH_INSTALL_DIR"
-curl -L -o "$GH_TARBALL" "$GH_URL"
-tar -xzf "$GH_TARBALL"
-GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
-if [ -z "$GH_BIN" ]; then
-  echo "[FATAL] Failed to find gh CLI executable"
-  exit 1
-fi
-echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
-echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
-command "$GH_BIN" release list --limit 5
-popd
-
 # install twine from pypi
 python3 -m venv /tmp/vllm-release-env
 source /tmp/vllm-release-env/bin/activate
@@ -89,16 +58,13 @@ echo "Wheels copied to local directory"
 git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
 ls -la $DIST_DIR

-
 # upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
 PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
-if [ -z "$PYPI_WHEEL_FILES" ]; then
+if [[ -z "$PYPI_WHEEL_FILES" ]]; then
  echo "No default variant wheels found, quitting..."
  exit 1
 fi
+
 python3 -m twine check $PYPI_WHEEL_FILES
-python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
+python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
 echo "Wheels uploaded to PyPI"
-
-# create release on GitHub with the release version and all wheels
-command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -456,6 +456,7 @@ th {
 | `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
 | `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
 | `Step1ForCausalLM` | Step-Audio | `stepfun-ai/Step-Audio-EditX`, etc. | ✅︎ | ✅︎ |
+| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/step-3.5-flash`, etc. |  | ✅︎ |
 | `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
 | `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
 | `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |

--- a/examples/pooling/score/vision_rerank_api_online.py
+++ b/examples/pooling/score/vision_rerank_api_online.py
@@ -18,48 +18,32 @@ e.g.
 """

 import argparse
-import base64
-import json
+import pprint

 import requests

-
-def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
-    """Encode a content retrieved from a remote url to base64 format."""
-
-    with requests.get(content_url, headers=headers) as response:
-        response.raise_for_status()
-        result = base64.b64encode(response.content).decode("utf-8")
-
-    return {"url": f"data:image/jpeg;base64,{result}"}
-
-
-headers = {"accept": "application/json", "Content-Type": "application/json"}
+from vllm.multimodal.utils import encode_image_url, fetch_image

 query = "A woman playing with her dog on a beach at sunset."
-documents = {
-    "content": [
-        {
-            "type": "text",
-            "text": (
+document = (
    "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
    "as the dog offers its paw in a heartwarming display of companionship and trust."
-            ),
+)
+image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+documents = [
+    {
+        "type": "text",
+        "text": document,
    },
    {
        "type": "image_url",
-            "image_url": {
-                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-            },
+        "image_url": {"url": image_url},
    },
    {
        "type": "image_url",
-            "image_url": encode_base64_content_from_url(
-                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-            ),
+        "image_url": {"url": encode_image_url(fetch_image(image_url))},
    },
-    ]
-}
+]


 def parse_args():
@@ -74,23 +58,36 @@ def main(args):
    models_url = base_url + "/v1/models"
    rerank_url = base_url + "/rerank"

-    response = requests.get(models_url, headers=headers)
+    response = requests.get(models_url)
    model = response.json()["data"][0]["id"]

-    data = {
+    print("Query: string & Document: list of string")
+    prompt = {"model": model, "query": query, "documents": [document]}
+    response = requests.post(rerank_url, json=prompt)
+    pprint.pprint(response.json())
+
+    print("Query: string & Document: text")
+    prompt = {"model": model, "query": query, "documents": {"content": [documents[0]]}}
+    response = requests.post(rerank_url, json=prompt)
+    pprint.pprint(response.json())
+
+    print("Query: string & Document: image url")
+    prompt = {
+        "model": model,
+        "query": query,
+        "documents": {"content": [documents[1]]},
+    }
+    response = requests.post(rerank_url, json=prompt)
+    pprint.pprint(response.json())
+
+    print("Query: string & Document: image base64")
+    prompt = {
        "model": model,
        "query": query,
-        "documents": documents,
+        "documents": {"content": [documents[2]]},
    }
-    response = requests.post(rerank_url, headers=headers, json=data)
-
-    # Check the response
-    if response.status_code == 200:
-        print("Request successful!")
-        print(json.dumps(response.json(), indent=2))
-    else:
-        print(f"Request failed with status code: {response.status_code}")
-        print(response.text)
+    response = requests.post(rerank_url, json=prompt)
+    pprint.pprint(response.json())


 if __name__ == "__main__":

--- a/examples/pooling/score/vision_score_api_online.py
+++ b/examples/pooling/score/vision_score_api_online.py
@@ -17,48 +17,32 @@ e.g.
 """

 import argparse
-import base64
-import json
 import pprint

 import requests

+from vllm.multimodal.utils import encode_image_url, fetch_image

-def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
-    """Encode a content retrieved from a remote url to base64 format."""
-
-    with requests.get(content_url, headers=headers) as response:
-        response.raise_for_status()
-        result = base64.b64encode(response.content).decode("utf-8")
-
-    return {"url": f"data:image/jpeg;base64,{result}"}
-
-
-headers = {"accept": "application/json", "Content-Type": "application/json"}
-
-queries = "slm markdown"
-documents = {
-    "content": [
+query = "A woman playing with her dog on a beach at sunset."
+document = (
+    "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
+    "as the dog offers its paw in a heartwarming display of companionship and trust."
+)
+image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+documents = [
    {
-            "type": "image_url",
-            "image_url": {
-                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
-            },
+        "type": "text",
+        "text": document,
    },
    {
        "type": "image_url",
-            "image_url": {
-                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-            },
+        "image_url": {"url": image_url},
    },
    {
        "type": "image_url",
-            "image_url": encode_base64_content_from_url(
-                "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-            ),
+        "image_url": {"url": encode_image_url(fetch_image(image_url))},
    },
-    ]
-}
+]


 def parse_args():
@@ -73,15 +57,40 @@ def main(args):
    models_url = base_url + "/v1/models"
    score_url = base_url + "/score"

-    response = requests.get(models_url, headers=headers)
+    response = requests.get(models_url)
    model = response.json()["data"][0]["id"]

-    prompt = {"model": model, "queries": queries, "documents": documents}
-    response = requests.post(score_url, headers=headers, json=prompt)
-    print("\nPrompt when queries is string and documents is a image list:")
-    pprint.pprint(prompt)
-    print("\nScore Response:")
-    print(json.dumps(response.json(), indent=2))
+    print("Query: string & Document: string")
+    prompt = {"model": model, "queries": query, "documents": document}
+    response = requests.post(score_url, json=prompt)
+    pprint.pprint(response.json())
+
+    print("Query: string & Document: text")
+    prompt = {
+        "model": model,
+        "queries": query,
+        "documents": {"content": [documents[0]]},
+    }
+    response = requests.post(score_url, json=prompt)
+    pprint.pprint(response.json())
+
+    print("Query: string & Document: image url")
+    prompt = {
+        "model": model,
+        "queries": query,
+        "documents": {"content": [documents[1]]},
+    }
+    response = requests.post(score_url, json=prompt)
+    pprint.pprint(response.json())
+
+    print("Query: string & Document: image base64")
+    prompt = {
+        "model": model,
+        "queries": query,
+        "documents": {"content": [documents[2]]},
+    }
+    response = requests.post(score_url, json=prompt)
+    pprint.pprint(response.json())


 if __name__ == "__main__":

--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -9,5 +9,5 @@ wheel
 jinja2>=3.1.6
 regex
 build
-protobuf
+protobuf >= 6.33.5
 grpcio-tools
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -9,9 +9,9 @@ blake3
 py-cpuinfo
 transformers >= 4.56.0, < 5
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
-protobuf # Required by LlamaTokenizer, gRPC.
+protobuf >= 6.33.5 # Required by LlamaTokenizer, gRPC. Minimum version pinned for CVE-2026-0994.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
-aiohttp
+aiohttp >= 3.13.3
 openai >= 1.99.1  # For Responses API with reasoning content
 pydantic >= 2.12.0
 prometheus_client >= 0.18.0

--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
-lmcache
+lmcache >= 0.3.9
 nixl >= 0.7.1 # Required for disaggregated prefill
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -14,7 +14,7 @@ pytest-shard==0.1.2
 # Async/HTTP dependencies
 anyio==4.6.2.post1
    # via httpx, starlette
-aiohttp==3.13.0
+aiohttp==3.13.3
    # via gpt-oss
 httpx==0.27.2
    # HTTP testing

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -12,7 +12,7 @@ affine==2.4.0
    # via rasterio
 aiohappyeyeballs==2.6.1
    # via aiohttp
-aiohttp==3.13.0
+aiohttp==3.13.3
    # via
    #   aiohttp-cors
    #   datasets

--- a/tests/compile/test_cold_start.py
+++ b/tests/compile/test_cold_start.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from torch._dynamo.utils import counters
+
+from vllm import LLM
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+
+
+def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
+    # Run in same process so we can access PyTorch's internal counters
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    # Disable AOT compile; it is unclear whether it would affect the counters below
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "0")
+
+    # Force cold compilation
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    compilation_config = CompilationConfig(
+        mode=CompilationMode.VLLM_COMPILE,
+        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
+    )
+
+    counters.clear()
+
+    _ = LLM(
+        model="microsoft/Phi-tiny-MoE-instruct",
+        max_model_len=256,
+        load_format="dummy",  # make the model loading faster
+        compilation_config=compilation_config,
+        num_gpu_blocks_override=8,  # make the model loading faster
+    )
+
+    # vLLM-compile cold start is special. By default, we do
+    # one full dynamo capture of the entire forward pass.
+    # The forward pass consists of 32 transformer layers.
+    # Then, we split on the attention operation. This results in
+    # 33 subgraphs (not including the attention operation).
+    # The 33 subgraphs then get standalone_compile'd.
+    #
+    # There are actually only 3 unique subgraphs for this model
+    # (all of its transformer layers are the same modulo weights);
+    # this is true for most vLLM models.
+    # So we test that during cold start, the aot_autograd cache
+    # misses for 3 subgraphs and hits for the rest.
+    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
+    assert counters["aot_autograd"]["autograd_cache_hit"] == 30
--- a/tests/compile/test_graph_partition.py
+++ b/tests/compile/test_graph_partition.py
@@ -8,6 +8,10 @@ import torch
 from torch.fx.experimental.proxy_tensor import make_fx

 from vllm.compilation.backends import split_graph
+from vllm.compilation.fx_utils import find_op_nodes
+
+# This import automatically registers `torch.ops.silly.attention`
+from . import silly_attention  # noqa: F401


 def test_getitem_moved_to_producer_subgraph():
@@ -122,3 +126,61 @@ def test_no_tuple_inputs_with_multiple_consumers():
    output_split = split_gm(new_x)

    assert torch.allclose(output_original, output_split), "Output mismatch after split"
+
+
+def test_consecutive_ops_in_split():
+    """
+    Test that consecutive splitting operations are grouped into the same subgraph
+    """
+
+    def model_fn(x: torch.Tensor) -> torch.Tensor:
+        """
+        Define a simple model where consecutive operations create opportunities
+        for splitting subgraphs.
+        """
+        # relu -> sqrt -> silly attention -> sigmoid; sqrt and attention are adjacent
+        intermediate = torch.relu(x)
+        attn_inout = torch.sqrt(intermediate)
+        torch.ops.silly.attention(intermediate, intermediate, attn_inout, attn_inout)
+        final_result = torch.sigmoid(attn_inout)
+        return final_result
+
+    torch.set_default_device("cuda")
+
+    # Create the traced FX graph for the model
+    x = torch.randn(8, 4)
+
+    gm = make_fx(model_fn)(x)
+
+    # Assert presence of the expected operations in the setup
+    assert (
+        len(list(find_op_nodes(torch.ops.aten.relu, gm.graph))) == 1
+        and len(list(find_op_nodes(torch.ops.aten.sqrt, gm.graph))) == 1
+    ), "Test setup failed: Expected sqrt and relu operations in the graph."
+
+    # Configure split operations to test
+    splitting_ops = ["silly::attention", "aten::sqrt"]
+    split_gm, split_items = split_graph(gm, splitting_ops)
+
+    # Validate the number of partitions
+    assert len(split_items) == 3, (
+        "Consecutive splitting operations were not grouped correctly."
+    )
+
+    # Validate that correctness is preserved
+    new_x = torch.randn(8, 4)
+    output_original = gm(new_x)
+    output_split = split_gm(new_x)
+    assert torch.allclose(output_original, output_split), (
+        "Output mismatch after splitting."
+    )
+
+    # Check the splitting graph has exactly 2 call_function nodes (sqrt and attn)
+    splitting_items = list(s for s in split_items if s.is_splitting_graph)
+    assert len(splitting_items) == 1, "Expecting a single splitting graph"
+    print(splitting_items[0].graph.graph)
+    splitting_gm = splitting_items[0].graph
+    assert len(splitting_gm.graph.nodes) == 4, "Expecting 4 nodes in splitting graph"
+    assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
+        "call_function"
+    ] + ["output"]
--- a/tests/entrypoints/pooling/classify/test_online_vision.py
+++ b/tests/entrypoints/pooling/classify/test_online_vision.py
@@ -5,9 +5,9 @@ import json
 import pytest
 import requests

-from tests.entrypoints.test_utils import encode_base64_content_from_url
 from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
+from vllm.multimodal.utils import encode_image_url, fetch_image

 MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
 MAXIMUM_VIDEOS = 1
@@ -19,7 +19,7 @@ HF_OVERRIDES = {
 }
 input_text = "This product was excellent and exceeded my expectations"
 image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
-image_base64 = encode_base64_content_from_url(image_url)
+image_base64 = {"url": encode_image_url(fetch_image(image_url))}
 video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"



--- a/tests/entrypoints/pooling/score/test_online_score_vision.py
+++ b/tests/entrypoints/pooling/score/test_online_score_vision.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import requests
+
+from tests.utils import VLLM_PATH, RemoteOpenAIServer
+from vllm.entrypoints.pooling.score.protocol import ScoreResponse
+from vllm.multimodal.utils import encode_image_url, fetch_image
+
+MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
+HF_OVERRIDES = {
+    "architectures": ["Qwen3VLForSequenceClassification"],
+    "classifier_from_token": ["no", "yes"],
+    "is_original_qwen3_reranker": True,
+}
+
+query = "A cat standing in the snow."
+image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
+documents = [
+    {
+        "type": "text",
+        "text": query,
+    },
+    {
+        "type": "image_url",
+        "image_url": {"url": image_url},
+    },
+    {
+        "type": "image_url",
+        "image_url": {"url": encode_image_url(fetch_image(image_url))},
+    },
+]
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--enforce-eager",
+        "--max-model-len",
+        "8192",
+        "--chat-template",
+        str(VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"),
+    ]
+
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, override_hf_configs=HF_OVERRIDES
+    ) as remote_server:
+        yield remote_server
+
+
+def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
+    queries = "What is the capital of France?"
+    documents = "The capital of France is Paris."
+
+    score_response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": MODEL_NAME,
+            "queries": queries,
+            "documents": documents,
+        },
+    )
+    score_response.raise_for_status()
+    score = ScoreResponse.model_validate(score_response.json())
+
+    assert score.id is not None
+    assert score.data is not None
+    assert len(score.data) == 1
+
+
+def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
+    score_response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": MODEL_NAME,
+            "queries": query,
+            "documents": {"content": [documents[0]]},
+        },
+    )
+    score_response.raise_for_status()
+    score = ScoreResponse.model_validate(score_response.json())
+
+    assert score.id is not None
+    assert score.data is not None
+    assert len(score.data) == 1
+
+
+def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer):
+    score_response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": MODEL_NAME,
+            "queries": query,
+            "documents": {"content": [documents[1]]},
+        },
+    )
+    score_response.raise_for_status()
+    score = ScoreResponse.model_validate(score_response.json())
+
+    assert score.id is not None
+    assert score.data is not None
+    assert len(score.data) == 1
+
+
+def test_score_api_queries_str_documents_image_base64_content(
+    server: RemoteOpenAIServer,
+):
+    score_response = requests.post(
+        server.url_for("score"),
+        json={
+            "model": MODEL_NAME,
+            "queries": query,
+            "documents": {"content": [documents[2]]},
+        },
+    )
+    score_response.raise_for_status()
+    score = ScoreResponse.model_validate(score_response.json())
+
+    assert score.id is not None
+    assert score.data is not None
+    assert len(score.data) == 1
--- a/tests/entrypoints/test_utils.py
+++ b/tests/entrypoints/test_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
-
-import requests
-
 from vllm.entrypoints.utils import sanitize_message


@@ -12,11 +8,3 @@ def test_sanitize_message():
        sanitize_message("<_io.BytesIO object at 0x7a95e299e750>")
        == "<_io.BytesIO object>"
    )
-
-
-def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
-    with requests.get(content_url) as response:
-        response.raise_for_status()
-        result = base64.b64encode(response.content).decode("utf-8")
-
-    return {"url": f"data:image/jpeg;base64,{result}"}
--- a/tests/kernels/core/test_activation.py
+++ b/tests/kernels/core/test_activation.py
@@ -17,6 +17,8 @@ from vllm.model_executor.layers.activation import (
    QuickGELU,
    SiluAndMul,
    SwigluOAIAndMul,
+    SwigluStepAndMul,
+    swiglustep_and_mul_triton,
 )
 from vllm.utils.torch_utils import set_random_seed

@@ -36,6 +38,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
        "gelu_tanh",
        "fatrelu",
        "swigluoai_and_mul",
+        "swiglustep_and_mul",
    ],
 )
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@@ -75,9 +78,12 @@ def test_act_and_mul(
    elif activation == "swigluoai_and_mul":
        layer = SwigluOAIAndMul()
        fn = torch.ops._C.swigluoai_and_mul
+    elif activation == "swiglustep_and_mul":
+        layer = SwigluStepAndMul()
+        fn = swiglustep_and_mul_triton
    out = layer(x)
    ref_out = layer.forward_native(x)
-    if activation == "swigluoai_and_mul":
+    if activation in ["swigluoai_and_mul", "swiglustep_and_mul"]:
        rtol = {
            # For fp16, change the relative tolerance from 1e-3 to 2e-3
            torch.float16: 2e-3,
@@ -104,7 +110,7 @@ def test_act_and_mul(
        opcheck(fn, (out, x, threshold))
    elif activation == "swigluoai_and_mul":
        opcheck(fn, (out, x, layer.alpha, layer.limit))
-    else:
+    elif activation != "swiglustep_and_mul":
        opcheck(fn, (out, x))



--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -715,7 +715,7 @@ def test_mixtral_moe(

        # need to override the forward context for unittests, otherwise it assumes
        # we're running the model forward pass (the model specified in vllm_config)
-        get_forward_context().remaining_moe_layers = None
+        get_forward_context().all_moe_layers = None

        # Run forward passes for both MoE blocks
        hf_states, _ = hf_moe.forward(hf_inputs)

--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
@@ -87,6 +87,13 @@ NKM_FACTORS_WVSPLITK_FP8 = [
 SEEDS = [0]


+def pad_weights_fp8(weight):
+    num_pad = 256 // weight.element_size()
+    import torch.nn.functional as F
+
+    return F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
+
+
 @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITKRC)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
@@ -191,11 +198,12 @@ def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed):
 @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("padded", [False, True])
 @pytest.mark.skipif(
    not (current_platform.is_rocm() and current_platform.supports_fp8()),
    reason="only test for rocm fp8",
 )
-def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
+def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed, padded):
    torch.manual_seed(seed)

    A = torch.rand(n, k, device="cuda") - 0.5
@@ -203,6 +211,8 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):

    A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
    B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
+    if padded:
+        B = pad_weights_fp8(B)

    ref_out = torch._scaled_mm(
        A, B.t(), out_dtype=dtype, scale_a=scale_a, scale_b=scale_b
@@ -222,11 +232,12 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
 @pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("padded", [False, True])
 @pytest.mark.skipif(
    not (current_platform.is_rocm() and current_platform.supports_fp8()),
    reason="only test for rocm fp8",
 )
-def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed):
+def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed, padded):
    torch.manual_seed(seed)

    xavier = math.sqrt(2 / k)  # normalize to avoid large output-bias deltas
@@ -236,6 +247,8 @@ def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed):

    A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
    B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
+    if padded:
+        B = pad_weights_fp8(B)

    ref_out = torch._scaled_mm(
        A, B.t(), out_dtype=dtype, scale_a=scale_a, scale_b=scale_b, bias=BIAS

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -479,6 +479,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Step1ForCausalLM": _HfExamplesInfo(
        "stepfun-ai/Step-Audio-EditX", trust_remote_code=True
    ),
+    "Step3p5ForCausalLM": _HfExamplesInfo(
+        "stepfun-ai/step-3.5-flash", is_available_online=False
+    ),
    "SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"),
    "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),
    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
@@ -1091,6 +1094,12 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "Qwen3NextMTP": _HfExamplesInfo(
        "Qwen/Qwen3-Next-80B-A3B-Instruct", min_transformers_version="4.56.3"
    ),
+    "Step3p5MTP": _HfExamplesInfo(
+        "stepfun-ai/Step-3.5-Flash",
+        trust_remote_code=True,
+        speculative_model="stepfun-ai/Step-3.5-Flash",
+        is_available_online=False,
+    ),
 }

 _TRANSFORMERS_BACKEND_MODELS = {