Commit 8d75f22e authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

parents ce888aa4 7d80c73d
# CI configuration for the "vllm_ci" pipeline.
name: vllm_ci

# Directories that hold job definitions.
job_dirs:
  - ".buildkite/test_areas"
  - ".buildkite/image_build"

# Path patterns; presumably a change matching one of these triggers every job
# (TODO confirm against the pipeline generator that consumes this file).
run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"
  - "requirements/common.txt"
  - "requirements/cuda.txt"
  - "requirements/build.txt"
  - "requirements/test.txt"
  - "setup.py"
  - "csrc/"
  - "cmake/"

# Paths excluded from run_all_patterns (suffixed Dockerfiles, CPU/ROCm sources).
run_all_exclude_patterns:
  - "docker/Dockerfile."
  - "csrc/cpu/"
  - "csrc/rocm/"
  - "cmake/hipify.py"
  - "cmake/cpu_extension.cmake"

# Container registry and the repositories inside it.
registries: public.ecr.aws/q9t5s3a7
repositories:
  main: "vllm-ci-postmerge-repo"
  premerge: "vllm-ci-test-repo"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os
# Minimal index page linking the x86_64 and aarch64 wheels. The href values
# receive the "+"-escaped file names, the link text the raw file names.
template = """<!DOCTYPE html>
<html>
<body>
<h1>Links for vLLM</h1>
<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

# Derive the sibling-architecture wheel name from the one we were given.
# This runs before index.html is opened so that an unsupported wheel raises
# without leaving a truncated index.html behind.
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
if "x86_64" in filename:
    x86_wheel = filename
    arm_wheel = filename.replace("x86_64", "aarch64").replace(
        "manylinux1", "manylinux2014"
    )
elif "aarch64" in filename:
    x86_wheel = filename.replace("aarch64", "x86_64").replace(
        "manylinux2014", "manylinux1"
    )
    arm_wheel = filename
else:
    raise ValueError(f"Unsupported wheel: {filename}")

with open("index.html", "w") as f:
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )

# Report success only once the file has actually been written.
print(f"Generated index.html for {args.wheel}")
#!/bin/bash
# Build the "test" target of docker/Dockerfile with docker buildx and push it
# to <registry>/<repo>:<commit> (plus :latest when <branch> is "main").
# The build is skipped when that image tag already exists.
set -e

if [[ $# -lt 8 ]]; then
  echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
VLLM_USE_PRECOMPILED=$5
VLLM_MERGE_BASE_COMMIT=$6
CACHE_FROM=$7
CACHE_TO=$8

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com

# docker buildx
docker buildx create --name vllm-builder --driver docker-container --use
docker buildx inspect --bootstrap
docker buildx ls

# skip build if image already exists
# (manifest inspect prints nothing on stdout when the tag is missing)
if [[ -z "$(docker manifest inspect "${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}")" ]]; then
  echo "Image not found, proceeding with build..."
else
  echo "Image found"
  exit 0
fi

# Optional arguments are collected in arrays so each element stays a single
# word regardless of its content, and an empty array expands to nothing.
merge_base_commit_build_args=()
if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
  merge_base_commit_build_args=(--build-arg "VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}")
fi

# Builds from main additionally move the :latest tag.
latest_tag_args=()
if [[ "${BRANCH}" == "main" ]]; then
  latest_tag_args=(--tag "${REGISTRY}/${REPO}:latest")
fi

# build
docker buildx build --file docker/Dockerfile \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
  --build-arg USE_SCCACHE=1 \
  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
  "${merge_base_commit_build_args[@]}" \
  --cache-from "type=registry,ref=${CACHE_FROM},mode=max" \
  --cache-to "type=registry,ref=${CACHE_TO},mode=max" \
  --tag "${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}" \
  "${latest_tag_args[@]}" \
  --push \
  --target test \
  --progress plain .
# Image build steps. Every step has no dependencies and is retried
# automatically (up to 2 times) on exit statuses -1 and -10.
group: Abuild
steps:
  # Main image; the build script takes precompiled/cache settings as arguments.
  - label: ":docker: Build image"
    key: image-build
    depends_on: []
    commands:
      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # CPU image.
  - label: ":docker: Build CPU image"
    key: image-build-cpu
    depends_on: []
    commands:
      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # HPU image; soft_fail is set, so a failure here is not treated as a hard failure.
  - label: ":docker: Build HPU image"
    soft_fail: true
    depends_on: []
    key: image-build-hpu
    commands:
      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2

  # CPU arm64 image; marked optional.
  - label: ":docker: Build CPU arm64 image"
    key: cpu-arm64-image-build
    depends_on: []
    optional: true
    commands:
      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 2
        - exit_status: -10  # Agent was lost
          limit: 2
#!/bin/bash
# Build the "vllm-test" target of docker/Dockerfile.cpu with the AVX512BF16,
# AVX512VNNI and AMXBF16 build args enabled, then push it as
# <registry>/<repo>:<commit>-cpu. The build is skipped when the tag exists.
set -e

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# Single source of truth for the image reference used by inspect/build/push.
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# (manifest inspect prints nothing on stdout when the tag is missing)
if [[ -z "$(docker manifest inspect "${IMAGE}")" ]]; then
  echo "Image not found, proceeding with build..."
else
  echo "Image found"
  exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
  --build-arg VLLM_CPU_AVX512BF16=true \
  --build-arg VLLM_CPU_AVX512VNNI=true \
  --build-arg VLLM_CPU_AMXBF16=true \
  --tag "${IMAGE}" \
  --target vllm-test \
  --progress plain .

# push
docker push "${IMAGE}"
#!/bin/bash
# Build the "vllm-test" target of docker/Dockerfile.cpu without any
# instruction-set build args and push it as <registry>/<repo>:<commit>-cpu.
# The build is skipped when the tag exists.
#
# NOTE(review): the "-cpu" tag suffix is not architecture-specific. If this
# script is run for a different architecture than another "-cpu" build of the
# same commit, whichever pushes first makes the other skip - confirm intended.
set -e

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# Single source of truth for the image reference used by inspect/build/push.
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-cpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# (manifest inspect prints nothing on stdout when the tag is missing)
if [[ -z "$(docker manifest inspect "${IMAGE}")" ]]; then
  echo "Image not found, proceeding with build..."
else
  echo "Image found"
  exit 0
fi

# build
docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
  --tag "${IMAGE}" \
  --target vllm-test \
  --progress plain .

# push
docker push "${IMAGE}"
#!/bin/bash
# Build the HPU image from the vllm-gaudi repository (used directly as the
# docker build context) and push it as <registry>/<repo>:<commit>-hpu.
# The build is skipped when the tag exists.
set -e

if [[ $# -lt 3 ]]; then
  echo "Usage: $0 <registry> <repo> <commit>"
  exit 1
fi

REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3

# Single source of truth for the image reference used by inspect/build/push.
IMAGE="${REGISTRY}/${REPO}:${BUILDKITE_COMMIT}-hpu"

# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"

# skip build if image already exists
# (manifest inspect prints nothing on stdout when the tag is missing)
if [[ -z "$(docker manifest inspect "${IMAGE}")" ]]; then
  echo "Image not found, proceeding with build..."
else
  echo "Image found"
  exit 0
fi

# build
# The --file path is resolved inside the remote git build context below.
docker build \
  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
  --build-arg max_jobs=16 \
  --build-arg buildkite_commit="${BUILDKITE_COMMIT}" \
  --tag "${IMAGE}" \
  --progress plain \
  https://github.com/vllm-project/vllm-gaudi.git

# push
docker push "${IMAGE}"
...@@ -8,3 +8,4 @@ tasks: ...@@ -8,3 +8,4 @@ tasks:
value: 0.80 value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5 num_fewshot: 5
rtol: 0.05
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
...@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \ ...@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1 --tp-size=1
""" """
import os
from contextlib import contextmanager
import lm_eval import lm_eval
import numpy as np import numpy as np
import yaml import yaml
RTOL = 0.08 DEFAULT_RTOL = 0.08
@contextmanager
def scoped_env_vars(new_env: dict[str, str]):
    """Temporarily apply ``new_env`` to ``os.environ`` for the ``with`` body.

    Each value is written as ``str(value)``. On exit (normal or via an
    exception) variables that existed before get their previous value back
    and variables that did not exist are removed. A falsy ``new_env``
    (``None`` or empty) leaves the environment untouched.
    """
    if new_env:
        # Previous value per key; None marks "was not set before".
        previous = {}
        try:
            for name, raw in new_env.items():
                previous[name] = os.environ.get(name)
                os.environ[name] = str(raw)
            yield
        finally:
            # Undo only the keys we got as far as recording.
            for name, prior in previous.items():
                if prior is None:
                    os.environ.pop(name, None)
                else:
                    os.environ[name] = prior
    else:
        # Nothing to override.
        yield
def launch_lm_eval(eval_config, tp_size): def launch_lm_eval(eval_config, tp_size):
...@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size): ...@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
f"trust_remote_code={trust_remote_code}," f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}," f"max_model_len={max_model_len},"
) )
results = lm_eval.simple_evaluate(
model=backend, env_vars = eval_config.get("env_vars", None)
model_args=model_args, with scoped_env_vars(env_vars):
tasks=[task["name"] for task in eval_config["tasks"]], results = lm_eval.simple_evaluate(
num_fewshot=eval_config["num_fewshot"], model=backend,
limit=eval_config["limit"], model_args=model_args,
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help tasks=[task["name"] for task in eval_config["tasks"]],
# text models. however, this is regressing measured strict-match for num_fewshot=eval_config["num_fewshot"],
# existing text models in CI, so only apply it for mm, or explicitly set limit=eval_config["limit"],
apply_chat_template=eval_config.get( # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
"apply_chat_template", backend == "vllm-vlm" # text models. however, this is regressing measured strict-match for
), # existing text models in CI, so only apply it for mm, or explicitly set
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), apply_chat_template=eval_config.get(
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) "apply_chat_template", backend == "vllm-vlm"
gen_kwargs=eval_config.get("gen_kwargs"), ),
batch_size=batch_size, fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
) # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs=eval_config.get("gen_kwargs"),
batch_size=batch_size,
)
return results return results
...@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size): ...@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results = launch_lm_eval(eval_config, tp_size) results = launch_lm_eval(eval_config, tp_size)
rtol = eval_config.get("rtol", DEFAULT_RTOL)
success = True success = True
for task in eval_config["tasks"]: for task in eval_config["tasks"]:
for metric in task["metrics"]: for metric in task["metrics"]:
...@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size): ...@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value = results["results"][task["name"]][metric["name"]] measured_value = results["results"][task["name"]][metric["name"]]
print( print(
f"{task['name']} | {metric['name']}: " f"{task['name']} | {metric['name']}: "
f"ground_truth={ground_truth} | measured={measured_value}" f"ground_truth={ground_truth:.3f} | "
f"measured={measured_value:.3f} | rtol={rtol}"
) )
success = success and np.isclose(ground_truth, measured_value, rtol=RTOL) success = success and np.isclose(ground_truth, measured_value, rtol=rtol)
assert success assert success
...@@ -7,18 +7,21 @@ ...@@ -7,18 +7,21 @@
import argparse import argparse
import json import json
import re
import sys import sys
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from urllib.parse import quote from urllib.parse import quote
import regex as re
if not sys.version_info >= (3, 12): if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.") raise RuntimeError("This script requires Python 3.12 or higher.")
INDEX_HTML_TEMPLATE = """<!DOCTYPE html> INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html> <html>
<!-- {comment} -->
<meta name="pypi:repository-version" content="1.0"> <meta name="pypi:repository-version" content="1.0">
<body> <body>
{items} {items}
...@@ -89,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo: ...@@ -89,7 +92,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
) )
def generate_project_list(subdir_names: list[str]) -> str: def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
""" """
Generate project list HTML content linking to each project & variant sub-directory. Generate project list HTML content linking to each project & variant sub-directory.
""" """
...@@ -97,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str: ...@@ -97,11 +100,14 @@ def generate_project_list(subdir_names: list[str]) -> str:
for name in sorted(subdir_names): for name in sorted(subdir_names):
name = name.strip("/").strip(".") name = name.strip("/").strip(".")
href_tags.append(f' <a href="{name}/">{name}/</a><br/>') href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
def generate_package_index_and_metadata( def generate_package_index_and_metadata(
wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path wheel_files: list[WheelFileInfo],
wheel_base_dir: Path,
index_base_dir: Path,
comment: str = "",
) -> tuple[str, str]: ) -> tuple[str, str]:
""" """
Generate package index HTML content for a specific package, linking to actual wheel files. Generate package index HTML content for a specific package, linking to actual wheel files.
...@@ -119,7 +125,7 @@ def generate_package_index_and_metadata( ...@@ -119,7 +125,7 @@ def generate_package_index_and_metadata(
file_meta = asdict(file) file_meta = asdict(file)
file_meta["path"] = file_path_quoted file_meta["path"] = file_path_quoted
metadata.append(file_meta) metadata.append(file_meta)
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment)
metadata_str = json.dumps(metadata, indent=2) metadata_str = json.dumps(metadata, indent=2)
return index_str, metadata_str return index_str, metadata_str
...@@ -130,6 +136,7 @@ def generate_index_and_metadata( ...@@ -130,6 +136,7 @@ def generate_index_and_metadata(
index_base_dir: Path, index_base_dir: Path,
default_variant: str | None = None, default_variant: str | None = None,
alias_to_default: str | None = None, alias_to_default: str | None = None,
comment: str = "",
): ):
""" """
Generate index for all wheel files. Generate index for all wheel files.
...@@ -140,6 +147,7 @@ def generate_index_and_metadata( ...@@ -140,6 +147,7 @@ def generate_index_and_metadata(
index_base_dir (Path): Base directory to store index files. index_base_dir (Path): Base directory to store index files.
default_variant (str | None): The default variant name, if any. default_variant (str | None): The default variant name, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any. alias_to_default (str | None): Alias variant name for the default variant, if any.
comment (str): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata. First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
...@@ -233,6 +241,10 @@ def generate_index_and_metadata( ...@@ -233,6 +241,10 @@ def generate_index_and_metadata(
variant_to_files[alias_to_default] = variant_to_files["default"].copy() variant_to_files[alias_to_default] = variant_to_files["default"].copy()
print(f"Alias variant '{alias_to_default}' created for default variant.") print(f"Alias variant '{alias_to_default}' created for default variant.")
# Generate comment in HTML header
comment_str = f" ({comment})" if comment else ""
comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}"
# Generate index for each variant # Generate index for each variant
subdir_names = set() subdir_names = set()
for variant, files in variant_to_files.items(): for variant, files in variant_to_files.items():
...@@ -252,7 +264,7 @@ def generate_index_and_metadata( ...@@ -252,7 +264,7 @@ def generate_index_and_metadata(
subdir_names = subdir_names.union(packages) subdir_names = subdir_names.union(packages)
else: else:
# generate project list for this variant directly # generate project list for this variant directly
project_list_str = generate_project_list(sorted(packages)) project_list_str = generate_project_list(sorted(packages), comment_tmpl)
with open(variant_dir / "index.html", "w") as f: with open(variant_dir / "index.html", "w") as f:
f.write(project_list_str) f.write(project_list_str)
...@@ -262,7 +274,7 @@ def generate_index_and_metadata( ...@@ -262,7 +274,7 @@ def generate_index_and_metadata(
package_dir = variant_dir / package package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True) package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata( index_str, metadata_str = generate_package_index_and_metadata(
package_files, wheel_base_dir, package_dir package_files, wheel_base_dir, package_dir, comment
) )
with open(package_dir / "index.html", "w") as f: with open(package_dir / "index.html", "w") as f:
f.write(index_str) f.write(index_str)
...@@ -270,7 +282,7 @@ def generate_index_and_metadata( ...@@ -270,7 +282,7 @@ def generate_index_and_metadata(
f.write(metadata_str) f.write(metadata_str)
# Generate top-level project list index # Generate top-level project list index
project_list_str = generate_project_list(sorted(subdir_names)) project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl)
with open(index_base_dir / "index.html", "w") as f: with open(index_base_dir / "index.html", "w") as f:
f.write(project_list_str) f.write(project_list_str)
...@@ -282,6 +294,7 @@ if __name__ == "__main__": ...@@ -282,6 +294,7 @@ if __name__ == "__main__":
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files --output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
--comment <comment_string> : (optional) comment string to include in generated HTML files
""" """
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
...@@ -311,6 +324,12 @@ if __name__ == "__main__": ...@@ -311,6 +324,12 @@ if __name__ == "__main__":
default=None, default=None,
help="Alias variant name for the default variant", help="Alias variant name for the default variant",
) )
parser.add_argument(
"--comment",
type=str,
default="",
help="Optional comment string to include in generated HTML files",
)
args = parser.parse_args() args = parser.parse_args()
...@@ -365,5 +384,6 @@ if __name__ == "__main__": ...@@ -365,5 +384,6 @@ if __name__ == "__main__":
index_base_dir=index_base_dir, index_base_dir=index_base_dir,
default_variant=None, default_variant=None,
alias_to_default=args.alias_to_default, alias_to_default=args.alias_to_default,
comment=args.comment.strip(),
) )
print(f"Successfully generated index and metadata in {output_dir}") print(f"Successfully generated index and metadata in {output_dir}")
...@@ -40,7 +40,8 @@ function cpu_tests() { ...@@ -40,7 +40,8 @@ function cpu_tests() {
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
set -e set -e
pytest -x -v -s tests/kernels/test_onednn.py pytest -x -v -s tests/kernels/test_onednn.py
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic"
# basic online serving # basic online serving
docker exec cpu-test bash -c ' docker exec cpu-test bash -c '
......
...@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME} ...@@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME}
# Define environments # Define environments
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV SOC_VERSION="ascend910b1"
RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \ RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \ pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
......
...@@ -38,6 +38,7 @@ docker run \ ...@@ -38,6 +38,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests cd tests
pytest -v -s v1/core pytest -v -s v1/core
...@@ -46,6 +47,6 @@ docker run \ ...@@ -46,6 +47,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_serial_utils.py
' '
...@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" ...@@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl" PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then
echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..."
exit 0
fi
echo "Setting up Prime-RL integration test environment..." echo "Setting up Prime-RL integration test environment..."
# Clean up any existing Prime-RL directory # Clean up any existing Prime-RL directory
......
#!/usr/bin/env bash
# For each all2all backend: start `vllm serve` for MODEL with EPLB (async)
# enabled, wait for /health, run the GSM8K eval against it, and fail if the
# reported accuracy is below THRESHOLD. Results go to OUT_DIR as JSON.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

# Poll the /health endpoint on the given port once per second; `timeout`
# aborts the wait after 600 seconds.
wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

# Stop the background server if it is still running: send TERM, poll for up
# to 20 x 0.5s, then send KILL. Every kill is allowed to fail so that
# `set -e` does not abort the cleanup itself.
cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # File-name-safe tag: map '/', ':', space and newline to '_'.
  # The set must be written '\n' (newline); '\\n' means "backslash, letter n"
  # to tr and would turn every 'n' in the model name into '_'. printf is used
  # instead of echo so no trailing newline is fed in and translated.
  TAG=$(printf '%s' "$MODEL" | tr '/: \n' '____')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
  # The heredoc is unquoted on purpose: the shell substitutes ${OUT}, ${MODEL},
  # ${BACK} and ${THRESHOLD} before Python sees the text.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  # Use a fresh port for the next backend.
  PORT=$((PORT+1))
done
...@@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do ...@@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
--data-parallel-size 2 \ --data-parallel-size 2 \
--enable-expert-parallel \ --enable-expert-parallel \
--enable-eplb \ --enable-eplb \
--eplb-config '{"window_size":200,"step_interval":600}' \
--trust-remote-code \ --trust-remote-code \
--max-model-len 2048 \ --max-model-len 2048 \
--port $PORT & --port $PORT &
......
#!/usr/bin/env bash
# For each all2all backend: start `vllm serve` for MODEL with EPLB (async) and
# MTP speculative decoding enabled, wait for /health, run the GSM8K eval
# against it, and fail if the reported accuracy is below THRESHOLD.
# Results go to OUT_DIR as JSON.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

# Poll the /health endpoint on the given port once per second; `timeout`
# aborts the wait after 600 seconds.
wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS based on platform
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi

# Stop the background server if it is still running: send TERM, poll for up
# to 20 x 0.5s, then send KILL. Every kill is allowed to fail so that
# `set -e` does not abort the cleanup itself.
cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # File-name-safe tag: map '/', ':', space and newline to '_'.
  # The set must be written '\n' (newline); '\\n' means "backslash, letter n"
  # to tr and would turn every 'n' in the model name into '_'
  # (e.g. "Qwen" -> "Qwe_"). printf is used instead of echo so no trailing
  # newline is fed in and translated.
  TAG=$(printf '%s' "$MODEL" | tr '/: \n' '____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"
  # The heredoc is unquoted on purpose: the shell substitutes ${OUT}, ${MODEL},
  # ${BACK} and ${THRESHOLD} before Python sees the text.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  # Use a fresh port for the next backend.
  PORT=$((PORT+1))
done
...@@ -81,7 +81,10 @@ else ...@@ -81,7 +81,10 @@ else
alias_arg="" alias_arg=""
fi fi
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg # HACK: we do not need regex module here, but it is required by pre-commit hook
# To avoid any external dependency, we simply replace it back to the stdlib re module
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg
# copy indices to /<commit>/ unconditionally # copy indices to /<commit>/ unconditionally
echo "Uploading indices to $S3_COMMIT_PREFIX" echo "Uploading indices to $S3_COMMIT_PREFIX"
......
...@@ -398,7 +398,8 @@ steps: ...@@ -398,7 +398,8 @@ steps:
timeout_in_minutes: 25 timeout_in_minutes: 25
gpu: h100 gpu: h100
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/v1/attention
- vllm/model_executor/layers
- tests/v1/determinism/ - tests/v1/determinism/
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
...@@ -440,23 +441,29 @@ steps: ...@@ -440,23 +441,29 @@ steps:
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
source_file_dependencies: source_file_dependencies:
- vllm/entrypoints - vllm/entrypoints
- vllm/multimodal
- examples/ - examples/
commands: commands:
- pip install tensorizer # for tensorizer test - pip install tensorizer # for tensorizer test
# for basic
- python3 offline_inference/basic/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py - python3 offline_inference/basic/classify.py
- python3 offline_inference/prefix_caching.py - python3 offline_inference/basic/embed.py
- python3 offline_inference/llm_engine_example.py - python3 offline_inference/basic/score.py
# for multi-modal models
- python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py # for pooling models
- python3 offline_inference/basic/embed.py - python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 offline_inference/basic/score.py # for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
...@@ -715,16 +722,18 @@ steps: ...@@ -715,16 +722,18 @@ steps:
# we can only upgrade after this is resolved # we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment # TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 - uv pip install --system torchao==0.13.0
- uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 15min - label: LM Eval Small Models # 53min
timeout_in_minutes: 20 timeout_in_minutes: 75
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
autorun_on_main: true
commands: commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
...@@ -737,7 +746,7 @@ steps: ...@@ -737,7 +746,7 @@ steps:
- csrc/ - csrc/
- vllm/entrypoints/openai/ - vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py - vllm/model_executor/models/whisper.py
commands: # LMEval commands: # LMEval+Transcription WER check
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/ - pytest -s entrypoints/openai/correctness/
...@@ -934,6 +943,18 @@ steps: ...@@ -934,6 +943,18 @@ steps:
commands: commands:
- pytest -v -s models/language/pooling_mteb_test - pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test (CPU)
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
source_file_dependencies:
- vllm/
- tests/models/multimodal
no_gpu: true
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Processor Test # 44min - label: Multi-Modal Processor Test # 44min
timeout_in_minutes: 60 timeout_in_minutes: 60
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -961,8 +982,8 @@ steps: ...@@ -961,8 +982,8 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 10min - label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min
timeout_in_minutes: 70 timeout_in_minutes: 180
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -974,7 +995,8 @@ steps: ...@@ -974,7 +995,8 @@ steps:
commands: commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1 - label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -998,7 +1020,8 @@ steps: ...@@ -998,7 +1020,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models Test (Extended) 3 - label: Multi-Modal Models Test (Extended) 3 # 75min
timeout_in_minutes: 150
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -1107,7 +1130,6 @@ steps: ...@@ -1107,7 +1130,6 @@ steps:
- vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/fused_moe/layer.py
- tests/compile/test_fusion_attn.py - tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py - tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py - tests/compile/distributed/test_fusion_all_reduce.py
...@@ -1141,17 +1163,15 @@ steps: ...@@ -1141,17 +1163,15 @@ steps:
- vllm/model_executor/layers/activation.py - vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py - vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py - tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands: commands:
- nvidia-smi - nvidia-smi
# Run all e2e fusion tests # Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: ROCm GPT-OSS Eval - label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60 timeout_in_minutes: 60
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
agent_pool: mi325_1 gpu: b200
mirror_hardwares: [amdexperimental, amdproduction]
optional: true # run on nightlies optional: true # run on nightlies
source_file_dependencies: source_file_dependencies:
- tests/evals/gpt_oss - tests/evals/gpt_oss
...@@ -1160,7 +1180,7 @@ steps: ...@@ -1160,7 +1180,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
commands: commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5' - uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test - label: Blackwell Quantized MoE Test
timeout_in_minutes: 60 timeout_in_minutes: 60
...@@ -1365,7 +1385,7 @@ steps: ...@@ -1365,7 +1385,7 @@ steps:
- pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py - pytest -v -s -x lora/test_olmoe_tp.py
# Disabled for now because MXFP4 backend on non-cuda platform # Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet # doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py #- pytest -v -s -x lora/test_gptoss_tp.py
...@@ -1431,12 +1451,13 @@ steps: ...@@ -1431,12 +1451,13 @@ steps:
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional - label: LM Eval Large Models # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: a100 gpu: a100
optional: true optional: true
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
num_gpus: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
...@@ -1448,11 +1469,11 @@ steps: ...@@ -1448,11 +1469,11 @@ steps:
##### H100 test ##### ##### H100 test #####
- label: LM Eval Large Models (H100) # optional - label: LM Eval Large Models (H100) # optional
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: h100 gpu: h100
optional: true optional: true
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
num_gpus: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
...@@ -1462,6 +1483,7 @@ steps: ...@@ -1462,6 +1483,7 @@ steps:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test ##### ##### H200 test #####
- label: Distributed Tests (H200) # optional - label: Distributed Tests (H200) # optional
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -1472,14 +1494,14 @@ steps: ...@@ -1472,14 +1494,14 @@ steps:
working_dir: "/vllm-workspace/" working_dir: "/vllm-workspace/"
num_gpus: 2 num_gpus: 2
commands: commands:
- pytest -v -s tests/compile/distributed/test_async_tp.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- pytest -v -s tests/distributed/test_sequence_parallel.py - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test ##### ##### B200 test #####
...@@ -1493,6 +1515,57 @@ steps: ...@@ -1493,6 +1515,57 @@ steps:
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
- pytest -v -s tests/v1/distributed/test_dbo.py - pytest -v -s tests/v1/distributed/test_dbo.py
##### E2E Eval Tests #####
- label: LM Eval Small Models (1 Card) # 15min
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: LM Eval Large Models (4 Card)
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
gpu: a100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: ROCm LM Eval Large Models (8 Card)
mirror_hardwares: [amdproduction]
agent_pool: mi325_8
num_gpus: 8
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
agent_pool: mi325_1
mirror_hardwares: [amdexperimental, amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
- vllm/model_executor/models/gpt_oss.py
- vllm/model_executor/layers/quantization/mxfp4.py
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
##### RL Integration Tests ##### ##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min - label: Prime-RL Integration Test # 15min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -1507,7 +1580,6 @@ steps: ...@@ -1507,7 +1580,6 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh - .buildkite/scripts/run-prime-rl-test.sh
commands: commands:
- bash .buildkite/scripts/run-prime-rl-test.sh - bash .buildkite/scripts/run-prime-rl-test.sh
- label: DeepSeek V2-Lite Accuracy - label: DeepSeek V2-Lite Accuracy
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4 agent_pool: mi325_4
...@@ -1539,4 +1611,28 @@ steps: ...@@ -1539,4 +1611,28 @@ steps:
num_gpus: 2 num_gpus: 2
working_dir: "/vllm-workspace" working_dir: "/vllm-workspace"
commands: commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
- label: DeepSeek V2-Lite Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
# grade: Blocking
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment