Commit fc7980db authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.15.1' into v0.15.1-ori

parents 3eab7fef 1892993b
......@@ -274,14 +274,14 @@ steps:
- input-release-version
- build-wheels
- label: "Upload release wheels to PyPI and GitHub"
- label: "Upload release wheels to PyPI"
depends_on:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/upload-release-wheels.sh"
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
......
......@@ -11,58 +11,80 @@ fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel (by commit):
\`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
(Optional) For CUDA 13.0:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
To download the wheel (by version):
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl .
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
Download images:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
Tag and push images:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-rocm
Create multi-arch manifest:
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
\`\`\`
EOF
......@@ -7,17 +7,19 @@ SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
RELEASE_VERSION=$(buildkite-agent meta-data get release-version)
echo "Release version from Buildkite: $RELEASE_VERSION"
GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null)
if [ -z "$GIT_VERSION" ]; then
echo "Release version from Buildkite: $RELEASE_VERSION"
if [[ -z "$GIT_VERSION" ]]; then
echo "[FATAL] Not on a git tag, cannot create release."
exit 1
else
echo "Git version for commit $BUILDKITE_COMMIT: $GIT_VERSION"
fi
# sanity check for version mismatch
if [ "$RELEASE_VERSION" != "$GIT_VERSION" ]; then
if [ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]; then
if [[ "$RELEASE_VERSION" != "$GIT_VERSION" ]]; then
if [[ "$FORCE_RELEASE_IGNORE_VERSION_MISMATCH" == "true" ]]; then
echo "[WARNING] Force release and ignore version mismatch"
else
echo "[FATAL] Release version from Buildkite does not match Git version."
......@@ -27,7 +29,7 @@ fi
PURE_VERSION=${RELEASE_VERSION#v} # remove leading 'v'
# check pypi token
if [ -z "$PYPI_TOKEN" ]; then
if [[ -z "$PYPI_TOKEN" ]]; then
echo "[FATAL] PYPI_TOKEN is not set."
exit 1
else
......@@ -35,41 +37,8 @@ else
export TWINE_PASSWORD="$PYPI_TOKEN"
fi
# check github token
if [ -z "$GITHUB_TOKEN" ]; then
echo "[FATAL] GITHUB_TOKEN is not set."
exit 1
else
export GH_TOKEN="$GITHUB_TOKEN"
fi
set -x # avoid printing secrets above
# download gh CLI from github
# Get latest gh CLI version from GitHub API
GH_VERSION=$(curl -s https://api.github.com/repos/cli/cli/releases/latest | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/' | sed 's/^v//')
if [ -z "$GH_VERSION" ]; then
echo "[FATAL] Failed to get latest gh CLI version from GitHub"
exit 1
fi
echo "Downloading gh CLI version: $GH_VERSION"
GH_TARBALL="gh_${GH_VERSION}_linux_amd64.tar.gz"
GH_URL="https://github.com/cli/cli/releases/download/v${GH_VERSION}/${GH_TARBALL}"
GH_INSTALL_DIR="/tmp/gh-install"
mkdir -p "$GH_INSTALL_DIR"
pushd "$GH_INSTALL_DIR"
curl -L -o "$GH_TARBALL" "$GH_URL"
tar -xzf "$GH_TARBALL"
GH_BIN=$(realpath $(find . -name "gh" -type f -executable | head -n 1))
if [ -z "$GH_BIN" ]; then
echo "[FATAL] Failed to find gh CLI executable"
exit 1
fi
echo "gh CLI downloaded successfully, version: $($GH_BIN --version)"
echo "Last 5 releases on GitHub:" # as a sanity check of gh and GH_TOKEN
command "$GH_BIN" release list --limit 5
popd
# install twine from pypi
python3 -m venv /tmp/vllm-release-env
source /tmp/vllm-release-env/bin/activate
......@@ -89,16 +58,13 @@ echo "Wheels copied to local directory"
git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT
ls -la $DIST_DIR
# upload wheels to PyPI (only default variant, i.e. files without '+' in the name)
PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*")
if [ -z "$PYPI_WHEEL_FILES" ]; then
if [[ -z "$PYPI_WHEEL_FILES" ]]; then
echo "No default variant wheels found, quitting..."
exit 1
fi
python3 -m twine check $PYPI_WHEEL_FILES
python3 -m twine --non-interactive --verbose upload $PYPI_WHEEL_FILES
python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES
echo "Wheels uploaded to PyPI"
# create release on GitHub with the release version and all wheels
command "$GH_BIN" release create $GIT_VERSION -d --latest --notes-from-tag --verify-tag $DIST_DIR/*.whl
......@@ -456,6 +456,7 @@ th {
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | |
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ |
| `Step1ForCausalLM` | Step-Audio | `stepfun-ai/Step-Audio-EditX`, etc. | ✅︎ | ✅︎ |
| `Step3p5ForCausalLM` | Step-3.5-flash | `stepfun-ai/step-3.5-flash`, etc. | | ✅︎ |
| `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ |
| `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ |
| `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ |
......
......@@ -18,48 +18,32 @@ e.g.
"""
import argparse
import base64
import json
import pprint
import requests
def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url, headers=headers) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode("utf-8")
return {"url": f"data:image/jpeg;base64,{result}"}
headers = {"accept": "application/json", "Content-Type": "application/json"}
from vllm.multimodal.utils import encode_image_url, fetch_image
query = "A woman playing with her dog on a beach at sunset."
documents = {
"content": [
{
"type": "text",
"text": (
document = (
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
"as the dog offers its paw in a heartwarming display of companionship and trust."
),
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
documents = [
{
"type": "text",
"text": document,
},
{
"type": "image_url",
"image_url": {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
},
"image_url": {"url": image_url},
},
{
"type": "image_url",
"image_url": encode_base64_content_from_url(
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
),
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
]
}
]
def parse_args():
......@@ -74,23 +58,36 @@ def main(args):
models_url = base_url + "/v1/models"
rerank_url = base_url + "/rerank"
response = requests.get(models_url, headers=headers)
response = requests.get(models_url)
model = response.json()["data"][0]["id"]
data = {
print("Query: string & Document: list of string")
prompt = {"model": model, "query": query, "documents": [document]}
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: text")
prompt = {"model": model, "query": query, "documents": {"content": [documents[0]]}}
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: image url")
prompt = {
"model": model,
"query": query,
"documents": {"content": [documents[1]]},
}
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: image base64")
prompt = {
"model": model,
"query": query,
"documents": documents,
"documents": {"content": [documents[2]]},
}
response = requests.post(rerank_url, headers=headers, json=data)
# Check the response
if response.status_code == 200:
print("Request successful!")
print(json.dumps(response.json(), indent=2))
else:
print(f"Request failed with status code: {response.status_code}")
print(response.text)
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
if __name__ == "__main__":
......
......@@ -17,48 +17,32 @@ e.g.
"""
import argparse
import base64
import json
import pprint
import requests
from vllm.multimodal.utils import encode_image_url, fetch_image
def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url, headers=headers) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode("utf-8")
return {"url": f"data:image/jpeg;base64,{result}"}
headers = {"accept": "application/json", "Content-Type": "application/json"}
queries = "slm markdown"
documents = {
"content": [
query = "A woman playing with her dog on a beach at sunset."
document = (
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "
"as the dog offers its paw in a heartwarming display of companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
documents = [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
"type": "text",
"text": document,
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
"image_url": {"url": image_url},
},
{
"type": "image_url",
"image_url": encode_base64_content_from_url(
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
),
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
]
}
]
def parse_args():
......@@ -73,15 +57,40 @@ def main(args):
models_url = base_url + "/v1/models"
score_url = base_url + "/score"
response = requests.get(models_url, headers=headers)
response = requests.get(models_url)
model = response.json()["data"][0]["id"]
prompt = {"model": model, "queries": queries, "documents": documents}
response = requests.post(score_url, headers=headers, json=prompt)
print("\nPrompt when queries is string and documents is a image list:")
pprint.pprint(prompt)
print("\nScore Response:")
print(json.dumps(response.json(), indent=2))
print("Query: string & Document: string")
prompt = {"model": model, "queries": query, "documents": document}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: text")
prompt = {
"model": model,
"queries": query,
"documents": {"content": [documents[0]]},
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: image url")
prompt = {
"model": model,
"queries": query,
"documents": {"content": [documents[1]]},
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: image base64")
prompt = {
"model": model,
"queries": query,
"documents": {"content": [documents[2]]},
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
if __name__ == "__main__":
......
......@@ -9,5 +9,5 @@ wheel
jinja2>=3.1.6
regex
build
protobuf
protobuf >= 6.33.5
grpcio-tools
......@@ -9,9 +9,9 @@ blake3
py-cpuinfo
transformers >= 4.56.0, < 5
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer, gRPC.
protobuf >= 6.33.5 # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
aiohttp >= 3.13.3
openai >= 1.99.1 # For Responses API with reasoning content
pydantic >= 2.12.0
prometheus_client >= 0.18.0
......
lmcache
lmcache >= 0.3.9
nixl >= 0.7.1 # Required for disaggregated prefill
......@@ -14,7 +14,7 @@ pytest-shard==0.1.2
# Async/HTTP dependencies
anyio==4.6.2.post1
# via httpx, starlette
aiohttp==3.13.0
aiohttp==3.13.3
# via gpt-oss
httpx==0.27.2
# HTTP testing
......
......@@ -12,7 +12,7 @@ affine==2.4.0
# via rasterio
aiohappyeyeballs==2.6.1
# via aiohttp
aiohttp==3.13.0
aiohttp==3.13.3
# via
# aiohttp-cors
# datasets
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from torch._dynamo.utils import counters
from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
    """Check AOT-autograd cache hit/miss counts for a cold vLLM compile.

    The model is built once with dummy weights and the compile cache
    disabled; the test then reads PyTorch's dynamo ``counters`` to verify
    how many subgraphs were compiled from scratch versus reused.

    Args:
        monkeypatch: pytest fixture used to set environment variables.
        use_fresh_inductor_cache: fixture requested for its side effect only
            (presumably provides an empty inductor cache -- defined
            elsewhere, TODO confirm).
    """
    env_overrides = {
        # Run in the same process so PyTorch's internal counters are visible.
        "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
        # NOTE(review): the original author was unsure whether this setting
        # affects the counter values asserted below.
        "VLLM_USE_AOT_COMPILE": "0",
        # Force cold compilation.
        "VLLM_DISABLE_COMPILE_CACHE": "1",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    cold_start_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        # Skipping CUDA graphs makes the model loading faster.
        cudagraph_mode=CUDAGraphMode.NONE,
    )

    # Start from zeroed counters so the assertions only see this compile.
    counters.clear()

    # Constructing the engine is what triggers compilation; the instance
    # itself is not used afterwards.
    _ = LLM(
        model="microsoft/Phi-tiny-MoE-instruct",
        max_model_len=256,
        # Dummy weights and a tiny KV cache make the model loading faster.
        load_format="dummy",
        compilation_config=cold_start_config,
        num_gpu_blocks_override=8,
    )

    # vLLM-compile cold start is special. By default there is one full
    # dynamo capture of the entire forward pass, which consists of 32
    # transformer layers. The graph is then split on the attention
    # operation, giving 33 subgraphs (not counting the attention operation
    # itself), and each of those is standalone_compile'd.
    #
    # Only 3 of the subgraphs are unique for this model, because all of its
    # transformer layers are identical modulo weights (true for most vLLM
    # models). So on a cold start the aot_autograd cache should miss for
    # 3 subgraphs and hit for the remaining 30.
    aot_autograd_stats = counters["aot_autograd"]
    assert aot_autograd_stats["autograd_cache_miss"] == 3
    assert aot_autograd_stats["autograd_cache_hit"] == 30
......@@ -8,6 +8,10 @@ import torch
from torch.fx.experimental.proxy_tensor import make_fx
from vllm.compilation.backends import split_graph
from vllm.compilation.fx_utils import find_op_nodes
# This import automatically registers `torch.ops.silly.attention`
from . import silly_attention # noqa: F401
def test_getitem_moved_to_producer_subgraph():
......@@ -122,3 +126,61 @@ def test_no_tuple_inputs_with_multiple_consumers():
output_split = split_gm(new_x)
assert torch.allclose(output_original, output_split), "Output mismatch after split"
def test_consecutive_ops_in_split():
    """
    Test that consecutive splitting operations are grouped into the same subgraph
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        """
        Define a simple model where two splitting operations (sqrt and
        silly attention) appear back to back, between a relu and a sigmoid.
        """
        intermediate = torch.relu(x)
        # sqrt and attention are the two ops listed in `splitting_ops` below;
        # they are adjacent in the traced graph.
        attn_inout = torch.sqrt(intermediate)
        torch.ops.silly.attention(intermediate, intermediate, attn_inout, attn_inout)
        final_result = torch.sigmoid(attn_inout)
        return final_result

    # NOTE(review): this changes the process-wide default device and is not
    # restored at the end of the test.
    torch.set_default_device("cuda")

    # Create the traced FX graph for the model
    x = torch.randn(8, 4)
    gm = make_fx(model_fn)(x)

    # Assert presence of the expected operations in the setup
    assert (
        len(list(find_op_nodes(torch.ops.aten.relu, gm.graph))) == 1
        and len(list(find_op_nodes(torch.ops.aten.sqrt, gm.graph))) == 1
    ), "Test setup failed: Expected sqrt and relu operations in the graph."

    # Configure split operations to test
    splitting_ops = ["silly::attention", "aten::sqrt"]
    split_gm, split_items = split_graph(gm, splitting_ops)

    # Validate the number of partitions: one before the splitting ops, one
    # holding both splitting ops, and one after them.
    assert len(split_items) == 3, (
        "Consecutive splitting operations were not grouped correctly."
    )

    # Validate that correctness is preserved
    new_x = torch.randn(8, 4)
    output_original = gm(new_x)
    output_split = split_gm(new_x)
    assert torch.allclose(output_original, output_split), (
        "Output mismatch after splitting."
    )

    # Check that the single splitting subgraph holds exactly the two
    # splitting ops (sqrt and attention) between its placeholder and output.
    splitting_items = [item for item in split_items if item.is_splitting_graph]
    assert len(splitting_items) == 1, "Expecting a single splitting graph"
    splitting_gm = splitting_items[0].graph
    # Printed so the subgraph is visible in captured output if an assertion
    # below fails.
    print(splitting_gm.graph)
    assert len(splitting_gm.graph.nodes) == 4, "Expecting 4 nodes in splitting graph"
    assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
        "call_function"
    ] + ["output"]
......@@ -5,9 +5,9 @@ import json
import pytest
import requests
from tests.entrypoints.test_utils import encode_base64_content_from_url
from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.classify.protocol import ClassificationResponse
from vllm.multimodal.utils import encode_image_url, fetch_image
MODEL_NAME = "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls"
MAXIMUM_VIDEOS = 1
......@@ -19,7 +19,7 @@ HF_OVERRIDES = {
}
input_text = "This product was excellent and exceeded my expectations"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_base64 = encode_base64_content_from_url(image_url)
image_base64 = {"url": encode_image_url(fetch_image(image_url))}
video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import requests
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.multimodal.utils import encode_image_url, fetch_image
MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
HF_OVERRIDES = {
"architectures": ["Qwen3VLForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
}
query = "A cat standing in the snow."
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
documents = [
{
"type": "text",
"text": query,
},
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
]
@pytest.fixture(scope="module")
def server():
    """Start one RemoteOpenAIServer for MODEL_NAME and share it module-wide.

    The server is launched in eager mode with an 8192-token context limit
    and the Qwen3-VL reranker chat template from the examples directory.
    It is yielded from inside the ``with`` block, so the context manager
    exits once the module's tests are done.
    """
    chat_template_path = (
        VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"
    )
    cli_args = [
        "--enforce-eager",
        "--max-model-len",
        "8192",
        "--chat-template",
        str(chat_template_path),
    ]

    with RemoteOpenAIServer(
        MODEL_NAME, cli_args, override_hf_configs=HF_OVERRIDES
    ) as remote_server:
        yield remote_server
def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
    """Score a plain-string query against a plain-string document."""
    question = "What is the capital of France?"
    answer = "The capital of France is Paris."
    payload = {
        "model": MODEL_NAME,
        "queries": question,
        "documents": answer,
    }

    response = requests.post(server.url_for("score"), json=payload)
    # Fail fast on any HTTP error before trying to parse the body.
    response.raise_for_status()

    parsed = ScoreResponse.model_validate(response.json())
    assert parsed.id is not None
    assert parsed.data is not None
    # One query paired with one document yields a single score entry.
    assert len(parsed.data) == 1
def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
    """Score a string query against a document given as a text content part."""
    # documents[0] is the {"type": "text", ...} entry of the module-level list.
    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": {"content": [documents[0]]},
    }

    response = requests.post(server.url_for("score"), json=payload)
    # Fail fast on any HTTP error before trying to parse the body.
    response.raise_for_status()

    parsed = ScoreResponse.model_validate(response.json())
    assert parsed.id is not None
    assert parsed.data is not None
    assert len(parsed.data) == 1
def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer):
    """Score a string query against a document given as a remote image URL."""
    # documents[1] is the image_url entry that points at the plain HTTP(S) URL.
    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": {"content": [documents[1]]},
    }

    response = requests.post(server.url_for("score"), json=payload)
    # Fail fast on any HTTP error before trying to parse the body.
    response.raise_for_status()

    parsed = ScoreResponse.model_validate(response.json())
    assert parsed.id is not None
    assert parsed.data is not None
    assert len(parsed.data) == 1
def test_score_api_queries_str_documents_image_base64_content(
    server: RemoteOpenAIServer,
):
    """Score a string query against a document given as an encoded image."""
    # documents[2] is the image_url entry whose URL was produced by
    # encode_image_url(fetch_image(image_url)) at module import time.
    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": {"content": [documents[2]]},
    }

    response = requests.post(server.url_for("score"), json=payload)
    # Fail fast on any HTTP error before trying to parse the body.
    response.raise_for_status()

    parsed = ScoreResponse.model_validate(response.json())
    assert parsed.id is not None
    assert parsed.data is not None
    assert len(parsed.data) == 1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import requests
from vllm.entrypoints.utils import sanitize_message
......@@ -12,11 +8,3 @@ def test_sanitize_message():
sanitize_message("<_io.BytesIO object at 0x7a95e299e750>")
== "<_io.BytesIO object>"
)
def encode_base64_content_from_url(content_url: str) -> dict[str, str]:
with requests.get(content_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode("utf-8")
return {"url": f"data:image/jpeg;base64,{result}"}
......@@ -17,6 +17,8 @@ from vllm.model_executor.layers.activation import (
QuickGELU,
SiluAndMul,
SwigluOAIAndMul,
SwigluStepAndMul,
swiglustep_and_mul_triton,
)
from vllm.utils.torch_utils import set_random_seed
......@@ -36,6 +38,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
"gelu_tanh",
"fatrelu",
"swigluoai_and_mul",
"swiglustep_and_mul",
],
)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
......@@ -75,9 +78,12 @@ def test_act_and_mul(
elif activation == "swigluoai_and_mul":
layer = SwigluOAIAndMul()
fn = torch.ops._C.swigluoai_and_mul
elif activation == "swiglustep_and_mul":
layer = SwigluStepAndMul()
fn = swiglustep_and_mul_triton
out = layer(x)
ref_out = layer.forward_native(x)
if activation == "swigluoai_and_mul":
if activation in ["swigluoai_and_mul", "swiglustep_and_mul"]:
rtol = {
# For fp16, change the relative tolerance from 1e-3 to 2e-3
torch.float16: 2e-3,
......@@ -104,7 +110,7 @@ def test_act_and_mul(
opcheck(fn, (out, x, threshold))
elif activation == "swigluoai_and_mul":
opcheck(fn, (out, x, layer.alpha, layer.limit))
else:
elif activation != "swiglustep_and_mul":
opcheck(fn, (out, x))
......
......@@ -715,7 +715,7 @@ def test_mixtral_moe(
# need to override the forward context for unittests, otherwise it assumes
# we're running the model forward pass (the model specified in vllm_config)
get_forward_context().remaining_moe_layers = None
get_forward_context().all_moe_layers = None
# Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(hf_inputs)
......
......@@ -87,6 +87,13 @@ NKM_FACTORS_WVSPLITK_FP8 = [
SEEDS = [0]
def pad_weights_fp8(weight):
    """Return ``weight`` with the same shape and values but padded storage.

    The last dimension is zero-padded by ``256 // weight.element_size()``
    elements and the padding is then sliced off again. The result is a view
    of the wider padded tensor, so its stride along the second-to-last
    dimension is larger than its visible width.

    Args:
        weight: tensor to pad; only the last dimension is affected.

    Returns:
        A non-contiguous view with the same shape and contents as ``weight``.
    """
    from torch.nn.functional import pad

    extra_cols = 256 // weight.element_size()
    widened = pad(weight, (0, extra_cols), mode="constant", value=0)
    # Drop the padded columns from the view; the underlying storage keeps them.
    return widened[..., :-extra_cols]
@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITKRC)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
......@@ -191,11 +198,12 @@ def test_rocm_wvsplitk_bias2D_kernel(n, k, m, dtype, seed):
@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("padded", [False, True])
@pytest.mark.skipif(
not (current_platform.is_rocm() and current_platform.supports_fp8()),
reason="only test for rocm fp8",
)
def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed, padded):
torch.manual_seed(seed)
A = torch.rand(n, k, device="cuda") - 0.5
......@@ -203,6 +211,8 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
if padded:
B = pad_weights_fp8(B)
ref_out = torch._scaled_mm(
A, B.t(), out_dtype=dtype, scale_a=scale_a, scale_b=scale_b
......@@ -222,11 +232,12 @@ def test_rocm_wvsplitk_fp8_kernel(n, k, m, dtype, seed):
@pytest.mark.parametrize("n,k,m", NKM_FACTORS_WVSPLITK_FP8)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("padded", [False, True])
@pytest.mark.skipif(
not (current_platform.is_rocm() and current_platform.supports_fp8()),
reason="only test for rocm fp8",
)
def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed):
def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed, padded):
torch.manual_seed(seed)
xavier = math.sqrt(2 / k) # normalize to avoid large output-bias deltas
......@@ -236,6 +247,8 @@ def test_rocm_wvsplitk_fp8_bias1D_kernel(n, k, m, dtype, seed):
A, scale_a = ref_dynamic_per_tensor_fp8_quant(A)
B, scale_b = ref_dynamic_per_tensor_fp8_quant(B)
if padded:
B = pad_weights_fp8(B)
ref_out = torch._scaled_mm(
A, B.t(), out_dtype=dtype, scale_a=scale_a, scale_b=scale_b, bias=BIAS
......
......@@ -479,6 +479,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Step1ForCausalLM": _HfExamplesInfo(
"stepfun-ai/Step-Audio-EditX", trust_remote_code=True
),
"Step3p5ForCausalLM": _HfExamplesInfo(
"stepfun-ai/step-3.5-flash", is_available_online=False
),
"SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"),
"StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"),
"StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
......@@ -1091,6 +1094,12 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"Qwen3NextMTP": _HfExamplesInfo(
"Qwen/Qwen3-Next-80B-A3B-Instruct", min_transformers_version="4.56.3"
),
"Step3p5MTP": _HfExamplesInfo(
"stepfun-ai/Step-3.5-Flash",
trust_remote_code=True,
speculative_model="stepfun-ai/Step-3.5-Flash",
is_available_online=False,
),
}
_TRANSFORMERS_BACKEND_MODELS = {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment