Commit a99300bd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

parents cc3e01c7 5438967f
...@@ -8,7 +8,8 @@ template = """<!DOCTYPE html> ...@@ -8,7 +8,8 @@ template = """<!DOCTYPE html>
<html> <html>
<body> <body>
<h1>Links for vLLM</h1/> <h1>Links for vLLM</h1/>
<a href="../{wheel_html_escaped}">{wheel}</a><br/> <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
</body> </body>
</html> </html>
""" """
...@@ -21,7 +22,25 @@ filename = os.path.basename(args.wheel) ...@@ -21,7 +22,25 @@ filename = os.path.basename(args.wheel)
with open("index.html", "w") as f: with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}") print(f"Generated index.html for {args.wheel}")
# sync the abi tag with .buildkite/scripts/upload-wheels.sh
if "x86_64" in filename:
x86_wheel = filename
arm_wheel = filename.replace("x86_64", "aarch64").replace(
"manylinux1", "manylinux2014"
)
elif "aarch64" in filename:
x86_wheel = filename.replace("aarch64", "x86_64").replace(
"manylinux2014", "manylinux1"
)
arm_wheel = filename
else:
raise ValueError(f"Unsupported wheel: {filename}")
# cloudfront requires escaping the '+' character # cloudfront requires escaping the '+' character
f.write( f.write(
template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) template.format(
x86_wheel=x86_wheel,
x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
arm_wheel=arm_wheel,
arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
)
) )
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5
...@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml ...@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml DeepSeek-V2-Lite-Chat.yaml
Meta-Llama-3-8B-QQQ.yaml
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers. # We can use this script to compute baseline accuracy on GSM for transformers.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4 # pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() { usage() {
echo`` echo``
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support. # We use this for fp8, which HF does not support.
# #
# Make sure you have lm-eval-harness installed: # Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.4 # pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() { usage() {
echo`` echo``
......
...@@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a ...@@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps. Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | | | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
......
...@@ -17,7 +17,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/ ...@@ -17,7 +17,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
- SGLang: `lmsysorg/sglang:v0.3.2-cu121` - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.* - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware - Hardware
- 8x Nvidia A100 GPUs - 8x Nvidia A100 GPUs
......
...@@ -3,44 +3,129 @@ ...@@ -3,44 +3,129 @@
import argparse import argparse
import json import json
import os import os
from importlib import util
import pandas as pd import pandas as pd
plotly_found = util.find_spec("plotly.express") is not None
def compare_data_columns( def compare_data_columns(
files, name_column, data_column, info_cols, drop_column, debug=False files, name_column, data_column, info_cols, drop_column, debug=False
): ):
print("\ncompare_data_column: " + data_column) """
Align concatenation by keys derived from info_cols instead of row order.
- Pick one canonical key list: subset of info_cols present in ALL files.
- For each file: set index to those keys, aggregate duplicates
- (mean for metric, first for names).
- Concat along axis=1 (indexes align), then reset_index so callers can
- group by columns.
- If --debug, add a <file_label>_name column per file.
"""
print("\ncompare_data_column:", data_column)
frames = [] frames = []
raw_data_cols = [] raw_data_cols = []
compare_frames = [] compare_frames = []
# 1) choose a canonical key list from info_cols that exists in ALL files
cols_per_file = []
for f in files:
try:
df_tmp = pd.read_json(f, orient="records")
except Exception as err:
raise ValueError(f"Failed to read {f}") from err
cols_per_file.append(set(df_tmp.columns))
key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
if not key_cols:
# soft fallback: use any info_cols present in the first file
key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
if not key_cols:
raise ValueError(
"No common key columns found from info_cols across the input files."
)
# 2) build a single "meta" block (keys as columns) once, aligned by the key index
meta_added = False
for file in files: for file in files:
data_df = pd.read_json(file) df = pd.read_json(file, orient="records")
serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
# Show all info columns in the first couple columns # Keep rows that actually have the compared metric (same as original behavior)
if not frames: if drop_column in df.columns:
for col in info_cols: df = df.dropna(subset=[drop_column], ignore_index=True)
if col not in serving_df.columns:
print(f"Skipping missing column: {col}") # Stabilize numeric key columns (harmless if missing)
continue for c in (
frames.append(serving_df[col]) "Input Len",
# only show test name under debug mode "Output Len",
if debug is True: "TP Size",
serving_df = serving_df.rename(columns={name_column: file + "_name"}) "PP Size",
frames.append(serving_df[file + "_name"]) "# of max concurrency.",
"qps",
file = "/".join(file.split("/")[:-1]) ):
serving_df = serving_df.rename(columns={data_column: file}) if c in df.columns:
frames.append(serving_df[file]) df[c] = pd.to_numeric(df[c], errors="coerce")
raw_data_cols.append(file)
compare_frames.append(serving_df[file]) # Ensure all key columns exist
for c in key_cols:
if c not in df.columns:
df[c] = pd.NA
# Set index = key_cols and aggregate duplicates → unique MultiIndex
df_idx = df.set_index(key_cols, drop=False)
# meta (key columns), unique per key
meta = df_idx[key_cols]
if not meta.index.is_unique:
meta = meta.groupby(level=key_cols, dropna=False).first()
# metric series for this file, aggregated to one row per key
file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
s = df_idx[data_column]
if not s.index.is_unique:
s = s.groupby(level=key_cols, dropna=False).mean()
s.name = file_label # column label like original
# add meta once (from first file) so keys are the leftmost columns
if not meta_added:
frames.append(meta)
meta_added = True
# (NEW) debug: aligned test-name column per file
if debug and name_column in df_idx.columns:
name_s = df_idx[name_column]
if not name_s.index.is_unique:
name_s = name_s.groupby(level=key_cols, dropna=False).first()
name_s.name = f"{file_label}_name"
frames.append(name_s)
frames.append(s)
raw_data_cols.append(file_label)
compare_frames.append(s)
# Generalize ratio: for any file N>=2, add ratio (fileN / file1)
if len(compare_frames) >= 2: if len(compare_frames) >= 2:
# Compare numbers among two files base = compare_frames[0]
ratio_df = compare_frames[1] / compare_frames[0] current = compare_frames[-1]
frames.append(ratio_df) ratio = current / base
compare_frames.pop(1) ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
# 4) concat on columns with aligned MultiIndex;
# then reset_index to return keys as columns
concat_df = pd.concat(frames, axis=1) concat_df = pd.concat(frames, axis=1)
concat_df = concat_df.reset_index(drop=True).reset_index()
if "index" in concat_df.columns:
concat_df = concat_df.drop(columns=["index"])
# Ensure key/info columns appear first (in your info_cols order)
front = [c for c in info_cols if c in concat_df.columns]
rest = [c for c in concat_df.columns if c not in front]
concat_df = concat_df[front + rest]
print(raw_data_cols) print(raw_data_cols)
return concat_df, raw_data_cols return concat_df, raw_data_cols
...@@ -67,6 +152,15 @@ def split_json_by_tp_pp( ...@@ -67,6 +152,15 @@ def split_json_by_tp_pp(
df = pd.DataFrame(data) df = pd.DataFrame(data)
# Keep only "serving" tests
name_col = next(
(c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
)
if name_col:
df = df[
df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
].copy()
# Handle alias column names # Handle alias column names
rename_map = { rename_map = {
"tp_size": "TP Size", "tp_size": "TP Size",
...@@ -181,7 +275,6 @@ if __name__ == "__main__": ...@@ -181,7 +275,6 @@ if __name__ == "__main__":
f"Expected subset: {filtered_info_cols}, " f"Expected subset: {filtered_info_cols}, "
f"but DataFrame has: {list(output_df.columns)}" f"but DataFrame has: {list(output_df.columns)}"
) )
output_df_sorted = output_df.sort_values(by=existing_group_cols) output_df_sorted = output_df.sort_values(by=existing_group_cols)
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
for name, group in output_groups: for name, group in output_groups:
...@@ -189,8 +282,7 @@ if __name__ == "__main__": ...@@ -189,8 +282,7 @@ if __name__ == "__main__":
text_file.write(html_msgs_for_data_cols[i]) text_file.write(html_msgs_for_data_cols[i])
text_file.write(html) text_file.write(html)
if plot is True: if plot and plotly_found:
import pandas as pd
import plotly.express as px import plotly.express as px
df = group[raw_data_cols] df = group[raw_data_cols]
......
...@@ -382,7 +382,7 @@ run_genai_perf_tests() { ...@@ -382,7 +382,7 @@ run_genai_perf_tests() {
client_command="genai-perf profile \ client_command="genai-perf profile \
-m $model \ -m $model \
--service-kind openai \ --service-kind openai \
--backend vllm \ --backend "$backend" \
--endpoint-type chat \ --endpoint-type chat \
--streaming \ --streaming \
--url localhost:$port \ --url localhost:$port \
......
...@@ -7,7 +7,7 @@ steps: ...@@ -7,7 +7,7 @@ steps:
commands: commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh" - "bash .buildkite/scripts/upload-wheels.sh"
...@@ -27,7 +27,12 @@ steps: ...@@ -27,7 +27,12 @@ steps:
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- block: "Build CUDA 12.6 wheel"
key: block-build-cu126-wheel
depends_on: ~
- label: "Build wheel - CUDA 12.6" - label: "Build wheel - CUDA 12.6"
depends_on: block-build-cu126-wheel
id: build-wheel-cuda-12-6 id: build-wheel-cuda-12-6
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
...@@ -57,23 +62,45 @@ steps: ...@@ -57,23 +62,45 @@ steps:
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- block: "Build release image" - label: "Build release image (x86)"
depends_on: ~ depends_on: ~
key: block-release-image-build id: build-release-image-x86
- label: "Build release image"
depends_on: block-release-image-build
id: build-release-image
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build release image (arm64)"
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
- label: "Create multi-arch manifest"
depends_on:
- build-release-image-x86
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow" - label: "Annotate release workflow"
depends_on: depends_on:
- build-release-image - create-multi-arch-manifest
- build-wheel-cuda-12-8 - build-wheel-cuda-12-8
- build-wheel-cuda-12-6 - build-wheel-cuda-12-6
- build-wheel-cuda-11-8 - build-wheel-cuda-11-8
......
...@@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then ...@@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_chat.py \ --ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \ --ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \ --ignore=entrypoints/llm/test_init.py \
--ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "} --ignore=entrypoints/llm/test_prompt_validation.py "}
fi fi
......
...@@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE ...@@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() { function cpu_tests() {
set -e set -e
...@@ -46,21 +46,26 @@ function cpu_tests() { ...@@ -46,21 +46,26 @@ function cpu_tests() {
set -e set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test # Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
# Note: disable until supports V1 # Note: disable until supports V1
# pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
# Note: disable Bart until supports V1 # Note: disable Bart until supports V1
pytest -v -s tests/models/language/generation -m cpu_model \ pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py --ignore=tests/models/language/generation/test_bart.py
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \ VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py --ignore=tests/models/language/generation/test_bart.py
pytest -v -s tests/models/language/pooling -m cpu_model pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \ pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \ --ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \ --ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model" -m cpu_model"
...@@ -68,35 +73,51 @@ function cpu_tests() { ...@@ -68,35 +73,51 @@ function cpu_tests() {
# Run compressed-tensor test # Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1 # Note: disable it until supports V1
# Run AWQ test # Run AWQ test
docker exec cpu-test-"$NUMA_NODE" bash -c " # docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e # set -e
VLLM_USE_V1=0 pytest -s -v \ # VLLM_USE_V1=0 pytest -x -s -v \
tests/quantization/test_ipex_quant.py" # tests/quantization/test_ipex_quant.py"
# Run multi-lora tests # Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -x -s -v \
tests/lora/test_qwen2vl.py" tests/lora/test_qwen2vl.py"
# online serving # online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c ' docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \ vllm bench serve \
--backend vllm \ --backend vllm \
--dataset-name random \ --dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \ --num-prompts 20 \
--endpoint /v1/completions' --endpoint /v1/completions
kill -s SIGTERM $server_pid &'
} }
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests export -f cpu_tests
timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
...@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" ...@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer && python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1 export VLLM_USE_V1=1
......
...@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" ...@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---" echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer && python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---" echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1 export VLLM_USE_V1=1
......
...@@ -23,12 +23,15 @@ docker run \ ...@@ -23,12 +23,15 @@ docker run \
--device /dev/dri \ --device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \ -v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \ --entrypoint="" \
-e "HF_TOKEN=${HF_TOKEN}" \
-e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
--name "${container_name}" \ --name "${container_name}" \
"${image_name}" \ "${image_name}" \
sh -c ' bash -c '
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m set -e
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 echo $ZE_AFFINITY_MASK
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
cd tests cd tests
...@@ -37,8 +40,8 @@ docker run \ ...@@ -37,8 +40,8 @@ docker run \
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_serial_utils.py pytest -v -s v1/test_serial_utils.py
pytest -v -s v1/test_utils.py pytest -v -s v1/test_utils.py
pytest -v -s v1/test_metrics_reader.py pytest -v -s v1/test_metrics_reader.py
......
...@@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then ...@@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
# Remove dangling images (those that are not tagged and not used by any container) # Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f docker image prune -f
# Remove unused volumes / force the system prune for old images as well. # Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all docker volume prune -f && docker system prune --force --filter "until=24h" --all
echo "Docker images and volumes cleanup completed." echo "Docker images and volumes cleanup completed."
else else
echo "Disk usage is below $threshold%. No cleanup needed." echo "Disk usage is below $threshold%. No cleanup needed."
......
...@@ -14,8 +14,19 @@ fi ...@@ -14,8 +14,19 @@ fi
# Get the single wheel file # Get the single wheel file
wheel="${wheel_files[0]}" wheel="${wheel_files[0]}"
# Rename 'linux' to 'manylinux1' in the wheel filename # Detect architecture and rename 'linux' to appropriate manylinux version
new_wheel="${wheel/linux/manylinux1}" arch=$(uname -m)
if [[ $arch == "x86_64" ]]; then
manylinux_version="manylinux1"
elif [[ $arch == "aarch64" ]]; then
manylinux_version="manylinux2014"
else
echo "Warning: Unknown architecture $arch, using manylinux1 as default"
manylinux_version="manylinux1"
fi
# Rename 'linux' to the appropriate manylinux version in the wheel filename
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel" mv -- "$wheel" "$new_wheel"
wheel="$new_wheel" wheel="$new_wheel"
......
...@@ -88,15 +88,6 @@ steps: ...@@ -88,15 +88,6 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min - label: Core Test # 10min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
fast_check: true fast_check: true
...@@ -118,10 +109,9 @@ steps: ...@@ -118,10 +109,9 @@ steps:
- tests/entrypoints/offline_mode - tests/entrypoints/offline_mode
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Test (API Server) # 40min - label: Entrypoints Test (API Server) # 40min
...@@ -135,7 +125,8 @@ steps: ...@@ -135,7 +125,8 @@ steps:
- tests/entrypoints/test_chat_utils - tests/entrypoints/test_chat_utils
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- label: Distributed Tests (4 GPUs) # 10min - label: Distributed Tests (4 GPUs) # 10min
...@@ -242,16 +233,34 @@ steps: ...@@ -242,16 +233,34 @@ steps:
# OOM in the CI unless we run this separately # OOM in the CI unless we run this separately
- pytest -v -s tokenization - pytest -v -s tokenization
- label: V1 Test - label: V1 Test e2e + engine
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/v1 - tests/v1
commands: commands:
# split the test to avoid interference # TODO: accuracy does not match, whether setting
- pytest -v -s v1/core # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/engine - pytest -v -s v1/engine
- label: V1 Test entrypoints
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/entrypoints - pytest -v -s v1/entrypoints
- label: V1 Test others
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/v1
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/sample - pytest -v -s v1/sample
- pytest -v -s v1/logits_processors - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker - pytest -v -s v1/worker
...@@ -263,9 +272,6 @@ steps: ...@@ -263,9 +272,6 @@ steps:
- pytest -v -s v1/test_utils.py - pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py - pytest -v -s v1/test_metrics_reader.py
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch). # Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
...@@ -295,15 +301,6 @@ steps: ...@@ -295,15 +301,6 @@ steps:
- python3 offline_inference/basic/score.py - python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands:
- pytest -v -s prefix_caching
- label: Platform Tests (CUDA) - label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
source_file_dependencies: source_file_dependencies:
...@@ -328,7 +325,7 @@ steps: ...@@ -328,7 +325,7 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
parallelism: 4 parallelism: 4
- label: PyTorch Compilation Unit Tests - label: PyTorch Compilation Unit Tests
...@@ -345,6 +342,7 @@ steps: ...@@ -345,6 +342,7 @@ steps:
- pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_fusion_all_reduce.py
- pytest -v -s compile/test_decorator.py
- label: PyTorch Fullgraph Smoke Test # 9min - label: PyTorch Fullgraph Smoke Test # 9min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -358,6 +356,7 @@ steps: ...@@ -358,6 +356,7 @@ steps:
- pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py - pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/piecewise/test_full_cudagraph.py - pytest -v -s compile/piecewise/test_full_cudagraph.py
- pytest -v -s compile/piecewise/test_multiple_graphs.py
- label: PyTorch Fullgraph Test # 18min - label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -404,6 +403,7 @@ steps: ...@@ -404,6 +403,7 @@ steps:
- csrc/moe/ - csrc/moe/
- tests/kernels/moe - tests/kernels/moe
- vllm/model_executor/layers/fused_moe/ - vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
commands: commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2 parallelism: 2
...@@ -462,19 +462,17 @@ steps: ...@@ -462,19 +462,17 @@ steps:
- tests/quantization - tests/quantization
commands: commands:
# temporary install here since we need nightly, will move to requirements/test.in # temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release # after torchao 0.12 release, and pin a working version of torchao nightly here
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: OpenAI API correctness - label: OpenAI API correctness
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -562,6 +560,14 @@ steps: ...@@ -562,6 +560,14 @@ steps:
commands: commands:
- pytest -v -s models/language/pooling -m 'not core_model' - pytest -v -s models/language/pooling -m 'not core_model'
- label: Multi-Modal Processor Test
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/processing
- label: Multi-Modal Models Test (Standard) - label: Multi-Modal Models Test (Standard)
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
torch_nightly: true torch_nightly: true
...@@ -571,9 +577,7 @@ steps: ...@@ -571,9 +577,7 @@ steps:
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model
- pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # Needs mp_method="spawn"
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Models Test (Extended) 1 - label: Multi-Modal Models Test (Extended) 1
...@@ -584,7 +588,7 @@ steps: ...@@ -584,7 +588,7 @@ steps:
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
- label: Multi-Modal Models Test (Extended) 2 - label: Multi-Modal Models Test (Extended) 2
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -647,8 +651,10 @@ steps: ...@@ -647,8 +651,10 @@ steps:
- vllm/model_executor/layers/fused_moe/cutlass_moe.py - vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py - vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py - vllm/compilation/fusion.py
- vllm/compilation/fusion_attn.py
commands: commands:
- nvidia-smi - nvidia-smi
- python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/basic/chat.py
...@@ -660,11 +666,17 @@ steps: ...@@ -660,11 +666,17 @@ steps:
# Quantization # Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
# Fusion # Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
...@@ -757,6 +769,11 @@ steps: ...@@ -757,6 +769,11 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py - pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y - pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests # end platform plugin tests
# begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
- pip install -e ./plugins/prithvi_io_processor_plugin
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# other tests continue here: # other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py - pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
...@@ -793,13 +810,14 @@ steps: ...@@ -793,13 +810,14 @@ steps:
# requires multi-GPU testing for validation. # requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_multi_loras_with_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py
- label: Weight Loading Multiple GPU Test # 33min - label: Weight Loading Multiple GPU Test # 33min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/weight_loading - tests/weight_loading
...@@ -847,3 +865,10 @@ steps: ...@@ -847,3 +865,10 @@ steps:
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
- label: Qwen MoE EP Test # optional
gpu: h200
optional: true
num_gpus: 2
commands:
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/multimodal @DarkLight1337 @ywang96 /vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson /vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee /vllm/lora @jeejeelee
...@@ -25,11 +26,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson ...@@ -25,11 +26,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1 # vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm /vllm/v1/structured_output @mgoin @russellb @aarnphm
/vllm/v1/attention/backends/triton_attn.py @tdoublep
# Test ownership # Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo /.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao /tests/distributed/test_same_node.py @youkaichao
...@@ -44,6 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson ...@@ -44,6 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee /tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
# Docs # Docs
/docs @hmellor /docs @hmellor
...@@ -72,3 +74,15 @@ mkdocs.yaml @hmellor ...@@ -72,3 +74,15 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/pixtral*.py @patrickvonplaten /vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten /vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
# Kernels
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep
# ROCm related: specify owner with write access to notify AMD folks for careful code review
/docker/Dockerfile.rocm* @gshtras
/vllm/v1/attention/backends/rocm*.py @gshtras
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
...@@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT ...@@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
## Test Result ## Test Result
## (Optional) Documentation Update
--- ---
<details> <details>
<summary> Essential Elements of an Effective PR Description Checklist </summary> <summary> Essential Elements of an Effective PR Description Checklist </summary>
...@@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT ...@@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
- [ ] The test plan, such as providing test command. - [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results - [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. - [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
</details> </details>
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions) **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment