diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index ea10624f95c36de222f425d137e5d2759a7a0e9d..23a23723ad93f0e5c4b05c9b269edc6a39c41e30 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -1,6 +1,7 @@ -group: Hardware +group: Hardware - AMD Build steps: - label: "AMD: :docker: build image" + key: image-build-amd depends_on: [] device: amd_cpu no_plugin: true @@ -9,7 +10,7 @@ steps: docker build --build-arg max_jobs=16 --build-arg REMOTE_VLLM=1 - --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' + --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950' --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}" -f docker/Dockerfile.rocm diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml index b387cf93502d6666def6dc62fa46e2fe325ee501..5c181943cefd5b5ab264bba27c91d90e74aa939f 100644 --- a/.buildkite/hardware_tests/cpu.yaml +++ b/.buildkite/hardware_tests/cpu.yaml @@ -21,6 +21,20 @@ steps: pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py" +- label: CPU-Compatibility Tests + depends_on: [] + soft_fail: true + device: intel_cpu + no_plugin: true + source_file_dependencies: + - cmake/cpu_extension.cmake + - setup.py + - vllm/platforms/cpu.py + commands: + - | + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh" + - label: CPU-Language Generation and Pooling Model Tests depends_on: [] soft_fail: true diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh index f0bbaab77512a5e2008554b9ea107bbf09145eaa..9131dfc71a0ab64b93b2ce93ab5db0589ebc5580 100755 --- a/.buildkite/image_build/image_build.sh +++ b/.buildkite/image_build/image_build.sh @@ -8,7 +8,7 @@ clean_docker_tag() { } print_usage_and_exit() { - echo "Usage: $0 " + echo "Usage: $0 []" exit 1 } @@ -142,11 +142,16 @@ 
resolve_parent_commit() { print_bake_config() { echo "--- :page_facing_up: Resolved bake configuration" - BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json" + # Write to a temp directory to avoid polluting the repo root (which is the + # Docker build context). Files left in the repo root get COPY'd into the + # image and can cause duplicate artifact uploads from downstream steps. + local bake_tmp + bake_tmp="$(mktemp -d)" + BAKE_CONFIG_FILE="${bake_tmp}/bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json" docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true echo "Saved bake config to ${BAKE_CONFIG_FILE}" echo "--- :arrow_down: Uploading bake config to Buildkite" - buildkite-agent artifact upload "${BAKE_CONFIG_FILE}" + (cd "$(dirname "${BAKE_CONFIG_FILE}")" && buildkite-agent artifact upload "$(basename "${BAKE_CONFIG_FILE}")") } ################################# @@ -154,7 +159,7 @@ print_bake_config() { ################################# print_instance_info -if [[ $# -lt 7 ]]; then +if [[ $# -lt 5 ]]; then print_usage_and_exit fi @@ -163,10 +168,8 @@ REGISTRY=$1 REPO=$2 BUILDKITE_COMMIT=$3 BRANCH=$4 -VLLM_USE_PRECOMPILED=$5 -VLLM_MERGE_BASE_COMMIT=$6 -IMAGE_TAG=$7 -IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional +IMAGE_TAG=$5 +IMAGE_TAG_LATEST=${6:-} # only used for main branch, optional # build config TARGET="test-ci" @@ -193,8 +196,6 @@ export CACHE_FROM export CACHE_FROM_BASE_BRANCH export CACHE_FROM_MAIN export CACHE_TO -export VLLM_USE_PRECOMPILED -export VLLM_MERGE_BASE_COMMIT # print args echo "--- :mag: Arguments" @@ -202,8 +203,6 @@ echo "REGISTRY: ${REGISTRY}" echo "REPO: ${REPO}" echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}" echo "BRANCH: ${BRANCH}" -echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}" -echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}" echo "IMAGE_TAG: ${IMAGE_TAG}" echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}" diff --git 
a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index 3026467bffce0a8ba569b9a51e1451388ff6aa6a..42eaed7ddaa0b44d64ddad063d34b107a1e3b45f 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -5,8 +5,7 @@ steps: depends_on: [] timeout_in_minutes: 600 commands: - - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi - - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi + - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi retry: automatic: - exit_status: -1 # Agent was lost diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh index a69732f430985c219c236fd8bbfb17aa61d677cd..ccfe155fa2b760d6e6575b4554a72c58fbf5167d 100755 --- a/.buildkite/image_build/image_build_cpu.sh +++ b/.buildkite/image_build/image_build_cpu.sh @@ -11,10 +11,10 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" # skip build if image already exists -if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then +if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu) ]]; then echo "Image not found, proceeding with build..." 
else echo "Image found" @@ -24,13 +24,11 @@ fi # build docker build --file docker/Dockerfile.cpu \ --build-arg max_jobs=16 \ - --build-arg buildkite_commit=$BUILDKITE_COMMIT \ - --build-arg VLLM_CPU_AVX512BF16=true \ - --build-arg VLLM_CPU_AVX512VNNI=true \ - --build-arg VLLM_CPU_AMXBF16=true \ - --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --build-arg buildkite_commit="$BUILDKITE_COMMIT" \ + --build-arg VLLM_CPU_X86=true \ + --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \ --target vllm-test \ --progress plain . # push -docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu +docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh index 615298b6555bd91579d53fd7c45e21a5df206345..ff3d11c8d5994cb73230d15286f7ee22bf222012 100755 --- a/.buildkite/image_build/image_build_cpu_arm64.sh +++ b/.buildkite/image_build/image_build_cpu_arm64.sh @@ -11,10 +11,10 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" # skip build if image already exists -if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then +if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu) ]]; then echo "Image not found, proceeding with build..." else echo "Image found" @@ -24,10 +24,10 @@ fi # build docker build --file docker/Dockerfile.cpu \ --build-arg max_jobs=16 \ - --build-arg buildkite_commit=$BUILDKITE_COMMIT \ - --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --build-arg buildkite_commit="$BUILDKITE_COMMIT" \ + --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu \ --target vllm-test \ --progress plain . 
# push -docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu +docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-arm64-cpu diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh index 192447ef4577e4fe744e8c86016c2097198602d0..60fa1789fa0648df5df2bf457a0bb0e5d1e3cb69 100755 --- a/.buildkite/image_build/image_build_hpu.sh +++ b/.buildkite/image_build/image_build_hpu.sh @@ -11,10 +11,10 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" # skip build if image already exists -if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then +if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then echo "Image not found, proceeding with build..." else echo "Image found" @@ -25,10 +25,10 @@ fi docker build \ --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \ --build-arg max_jobs=16 \ - --build-arg buildkite_commit=$BUILDKITE_COMMIT \ - --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \ + --build-arg buildkite_commit="$BUILDKITE_COMMIT" \ + --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu \ --progress plain \ https://github.com/vllm-project/vllm-gaudi.git # push -docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu +docker push "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh index 0745da8dc418d478d84df9c45978f5da19152f6c..518af9a660188c8414c4508c9759e578e35d81f0 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on chartqa for vllm. 
# # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.9.2" +# pip install "lm-eval[api]>=0.4.11" usage() { echo`` @@ -41,4 +41,4 @@ lm_eval --model vllm-vlm \ --tasks chartqa \ --batch_size auto \ --apply_chat_template \ - --limit $LIMIT + --limit "$LIMIT" diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index 5c17a06245bcf6277decc55bb3236fd2e618eb34..f010ffe6752d967c7365a26141e85bc1fde14e9d 100755 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -2,7 +2,7 @@ # We can use this script to compute baseline accuracy on GSM for transformers. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.9.2" +# pip install "lm-eval[api]>=0.4.11" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 1b617ff17c41c3f7e2b4e13aed8ad9b0938fa2e8..fec4a94e63e4450ab5e6cc5fd56fee06a143be17 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. # # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.9.2" +# pip install "lm-eval[api]>=0.4.11" usage() { echo`` diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh index 12336d7f85bc918cd5776d82fffeca518f474180..e3c6e16bd6b30e33804abfc862b262d1e440f933 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -3,7 +3,7 @@ # We use this for fp8, which HF does not support. 
# # Make sure you have lm-eval-harness installed: -# pip install "lm-eval[api]>=0.4.9.2" +# pip install "lm-eval[api]>=0.4.11" usage() { echo`` @@ -20,14 +20,11 @@ usage() { echo } -while getopts "m:b:l:f:t:" OPT; do +while getopts "m:l:f:t:" OPT; do case ${OPT} in m ) MODEL="$OPTARG" ;; - b ) - BATCH_SIZE="$OPTARG" - ;; l ) LIMIT="$OPTARG" ;; diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index a22abe73e39f72abdab84e51a38324b696ef7cf0..fad5f593be4f46cbafdcd9857369b2001e9452da 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -13,9 +13,10 @@ import os from contextlib import contextmanager import lm_eval -import numpy as np import yaml +from vllm.platforms import current_platform + DEFAULT_RTOL = 0.08 @@ -63,6 +64,9 @@ def launch_lm_eval(eval_config, tp_size): "allow_deprecated_quantization=True," ) + if current_platform.is_rocm() and "Nemotron-3" in eval_config["model_name"]: + model_args += "attention_backend=TRITON_ATTN" + env_vars = eval_config.get("env_vars", None) with scoped_env_vars(env_vars): results = lm_eval.simple_evaluate( @@ -102,6 +106,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size): f"ground_truth={ground_truth:.3f} | " f"measured={measured_value:.3f} | rtol={rtol}" ) - success = success and np.isclose(ground_truth, measured_value, rtol=rtol) + + min_acceptable = ground_truth * (1 - rtol) + success = success and measured_value >= min_acceptable assert success diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 289877e504bbda8cd4fba7b2fb4b32ae50c19977..3a321c0fefdf135b0234b3522cec5d975f577b4f 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -83,7 +83,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co "server_parameters": { 
"model": "meta-llama/Meta-Llama-3-8B", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index b3d0a2d3bbce0b6804a4fdd0ac177628bd860ebb..c9f8139fe62f0663aeb199523d0701f5a89c47f4 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -7,8 +7,10 @@ import argparse import html as _html import json import os +from contextlib import nullcontext from dataclasses import dataclass from importlib import util +from pathlib import Path import pandas as pd @@ -31,6 +33,45 @@ pd.set_option("display.precision", 2) pd.set_option("display.float_format", lambda x: f"{x:.2f}") +# ----------------------------- +# Concurrency normalization (NEW, small) +# ----------------------------- +def _find_concurrency_col(df: pd.DataFrame) -> str: + for c in [ + "# of max concurrency.", + "# of max concurrency", + "Max Concurrency", + "max_concurrency", + "Concurrency", + ]: + if c in df.columns: + return c + + for c in df.columns: + if "concurr" in str(c).lower(): + s = df[c] + if s.dtype.kind in "iu" and s.nunique() > 1 and s.min() >= 1: + return c + + raise ValueError( + "Cannot infer concurrency column. " + "Please rename the column to one of the known names " + "or add an explicit override (e.g., --concurrency-col)." + ) + + +def _normalize_concurrency_in_df( + df: pd.DataFrame, canonical: str = "# of max concurrency." 
+) -> pd.DataFrame: + if canonical in df.columns: + return df + detected = _find_concurrency_col(df) + if detected in df.columns and detected != canonical: + return df.rename(columns={detected: canonical}) + df[canonical] = pd.NA + return df + + # ----------------------------- # Core data compare # ----------------------------- @@ -50,19 +91,25 @@ def compare_data_columns( - Concat along axis=1 (indexes align), then reset_index so callers can group by columns. - If --debug, add a _name column per file. + + Minimal fix to support different max_concurrency lists across files: + - normalize concurrency column naming to "# of max concurrency." + - align on UNION of keys (missing points become NaN) + - BUGFIX: don't drop throughput rows based on P99/Median presence """ print("\ncompare_data_column:", data_column) frames = [] raw_data_cols: list[str] = [] - compare_frames = [] + # Determine key cols after normalizing concurrency cols_per_file: list[set] = [] for f in files: try: df_tmp = pd.read_json(f, orient="records") except Exception as err: raise ValueError(f"Failed to read {f}") from err + df_tmp = _normalize_concurrency_in_df(df_tmp, canonical="# of max concurrency.") cols_per_file.append(set(df_tmp.columns)) key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] @@ -73,12 +120,25 @@ def compare_data_columns( "No common key columns found from info_cols across the input files." ) - meta_added = False + union_index = None + metas: list[pd.DataFrame] = [] + staged: list[tuple[str, pd.Series, pd.Series | None]] = [] for file in files: df = pd.read_json(file, orient="records") - - if drop_column in df.columns: + df = _normalize_concurrency_in_df(df, canonical="# of max concurrency.") + + # BUGFIX: only drop rows for latency-like metrics; throughput rows may have + # NaN in P99/Median columns even if the column exists in the JSON. 
+ metric_lc = str(data_column).lower() + is_latency_metric = ( + "ttft" in metric_lc + or "tpot" in metric_lc + or "p99" in metric_lc + or "median" in metric_lc + or metric_lc.strip() in {"p99", "median"} + ) + if is_latency_metric and drop_column in df.columns: df = df.dropna(subset=[drop_column], ignore_index=True) for c in ( @@ -103,35 +163,61 @@ def compare_data_columns( meta = meta.groupby(level=key_cols, dropna=False).first() file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) - s = df_idx[data_column] - if not s.index.is_unique: - s = s.groupby(level=key_cols, dropna=False).mean() - s.name = file_label - if not meta_added: - frames.append(meta) - meta_added = True + if data_column in df_idx.columns: + s = df_idx[data_column] + if not s.index.is_unique: + s = s.groupby(level=key_cols, dropna=False).mean() + else: + # keep NA series to preserve meta keys for union_index + s = pd.Series(pd.NA, index=meta.index) + s.name = file_label + name_s = None if debug and name_column in df_idx.columns: name_s = df_idx[name_column] if not name_s.index.is_unique: name_s = name_s.groupby(level=key_cols, dropna=False).first() name_s.name = f"{file_label}_name" - frames.append(name_s) - frames.append(s) + if union_index is None: + union_index = meta.index + else: + union_index = union_index.union(meta.index) + metas.append(meta) + + staged.append((file_label, s, name_s)) + + if union_index is None: + raise ValueError("No data found after loading inputs.") + + # meta first (union-aligned): build UNION meta across all files + if metas: + meta_union = pd.concat(metas, axis=0) + # Collapse duplicates on the MultiIndex; keep first non-null per column + meta_union = meta_union.groupby(level=key_cols, dropna=False).first() + frames.append(meta_union.reindex(union_index)) + + # values + ratios (union-aligned) + metric_series_aligned: list[pd.Series] = [] + for file_label, s, name_s in staged: + s_aligned = s.reindex(union_index) + frames.append(s_aligned) 
raw_data_cols.append(file_label) - compare_frames.append(s) + metric_series_aligned.append(s_aligned) + + if debug and name_s is not None: + frames.append(name_s.reindex(union_index)) - if len(compare_frames) >= 2: - base = compare_frames[0] - current = compare_frames[-1] - if "P99" in data_column or "Median" in data_column: + if len(metric_series_aligned) >= 2: + base = metric_series_aligned[0] + current = metric_series_aligned[-1] + if "P99" in str(data_column) or "Median" in str(data_column): ratio = base / current else: ratio = current / base ratio = ratio.mask(base == 0) - ratio.name = f"Ratio 1 vs {len(compare_frames)}" + ratio.name = f"Ratio 1 vs {len(metric_series_aligned)}" frames.append(ratio) concat_df = pd.concat(frames, axis=1).reset_index(drop=True) @@ -202,24 +288,10 @@ def split_json_by_tp_pp( # ----------------------------- # Styling helpers # ----------------------------- -def _find_concurrency_col(df: pd.DataFrame) -> str: - for c in [ - "# of max concurrency.", - "# of max concurrency", - "Max Concurrency", - "max_concurrency", - "Concurrency", - ]: - if c in df.columns: - return c - for c in df.columns: - if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: - return c - return "# of max concurrency." 
- - def _highlight_threshold( - df: pd.DataFrame, threshold: float + df: pd.DataFrame, + threshold: float, + slack_pct: float = 0.0, ) -> pd.io.formats.style.Styler: conc_col = _find_concurrency_col(df) key_cols = [ @@ -232,12 +304,24 @@ def _highlight_threshold( ] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] - return df.style.map( - lambda v: "background-color:#e6ffe6;font-weight:bold;" - if pd.notna(v) and v <= threshold - else "", - subset=conf_cols, - ) + try: + slack_pct = float(slack_pct or 0.0) + except Exception: + slack_pct = 0.0 + slack_limit = threshold * (1.0 + slack_pct / 100.0) + + def _cell(v): + if pd.isna(v): + return "" + if v <= threshold: + # Strict SLA + return "background-color:#e6ffe6;font-weight:bold;" + if v <= slack_limit: + # Within slack range + return "background-color:#ffe5cc;font-weight:bold;" + return "" + + return df.style.map(_cell, subset=conf_cols) def highlight_ratio_columns(styler: pd.io.formats.style.Styler): @@ -275,6 +359,177 @@ def _apply_two_decimals( return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="") +# ----------------------------- +# Export helpers (Excel + CSV) +# ----------------------------- +def _sanitize_sheet_name(name: str) -> str: + """ + Excel sheet constraints: + - max 31 chars + - cannot contain: : \ / ? * [ ] + - cannot be empty + + NOTE: Use fast, non-regex operations here to avoid the third-party `regex` + module's compile overhead/edge-cases on some systems. + """ + name = "sheet" if name is None else str(name) + + # Replace illegal characters with underscore. + trans = str.maketrans( + { + ":": "_", + "\\": "_", + "/": "_", + "?": "_", + "*": "_", + "[": "_", + "]": "_", + } + ) + name = name.translate(trans) + + # Strip quotes/spaces and collapse whitespace. 
+ name = name.strip().strip("'") + name = " ".join(name.split()) + + if not name: + name = "sheet" + return name[:31] + + +def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str: + d = dict(zip(group_cols, gkey_tuple)) + + # Always keep input/output lengths (these are important). + ilen = d.get("Input Len", "") + olen = d.get("Output Len", "") + lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else "" + + # Shorten model name aggressively to make room for lens. + model = d.get("Model", "model") + leaf = str(model).split("/")[-1] + + max_model_len = max(1, 31 - len(lens)) + model_short = leaf[:max_model_len] + + return _sanitize_sheet_name(f"{model_short}{lens}") + + +def _write_tables_to_excel_sheet( + writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]] +): + """Write all blocks to a sheet with a single to_excel() call. + + Pandas+openpyxl can be extremely slow when called many times per sheet. + We flatten blocks into one table with a 'Section' column to keep structure + while making Excel generation fast and deterministic. + """ + if not blocks: + pd.DataFrame().to_excel(writer, sheet_name=sheet, index=False) + return + + combined_parts: list[pd.DataFrame] = [] + for title, df in blocks: + df2 = df.copy() + # Put the section label as the first column for readability. + df2.insert(0, "Section", title) + combined_parts.append(df2) + + combined = pd.concat(combined_parts, axis=0, ignore_index=True, sort=False) + combined.to_excel(writer, sheet_name=sheet, index=False) + + +def _safe_filename(s: str) -> str: + # Fast path without the third-party `regex` module. 
+ s = " ".join(str(s).strip().split()) + allowed = [] + for ch in s: + if ch.isalnum() or ch in "._-": + allowed.append(ch) + else: + allowed.append("_") + out = "".join(allowed) + return out[:180] if len(out) > 180 else out + + +# ----------------------------- +# vLLM environment export helper +# ----------------------------- +def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame: + """Parse vllm_env.txt into a flat table (Section, Key, Value). + + Supports: + - section headers as standalone lines (no ':' or '=') + - key-value lines like 'OS: Ubuntu ...' + - env var lines like 'HF_HOME=/data/hf' + """ + lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines() + section = "General" + rows: list[dict] = [] + + def set_section(s: str): + nonlocal section + s = (s or "").strip() + if s: + section = s + + for raw in lines: + stripped = raw.strip() + if not stripped: + continue + # divider lines like ===== + if set(stripped) <= {"="}: + continue + + # section header heuristic: short standalone line + if ":" not in stripped and "=" not in stripped and len(stripped) <= 64: + if stripped.lower().startswith("collecting environment information"): + continue + set_section(stripped) + continue + + # env var style: KEY=VALUE (and not a URL with :) + if "=" in stripped and ":" not in stripped: + k, v = stripped.split("=", 1) + k = k.strip() + v = v.strip() + if k: + rows.append({"Section": section, "Key": k, "Value": v}) + continue + + # key: value + if ":" in stripped: + k, v = stripped.split(":", 1) + k = k.strip() + v = v.strip() + if k: + rows.append({"Section": section, "Key": k, "Value": v}) + continue + + return pd.DataFrame(rows, columns=["Section", "Key", "Value"]) + + +def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None: + """Load vllm_env.txt next to the *original* input JSON file. 
+ + Note: when only one -f is provided, the script may split JSON into ./splits/..., + but vllm_env.txt typically lives next to the original benchmark_results.json. + """ + base_dir: Path | None = None + if getattr(args, "file", None): + base_dir = Path(args.file[0]).resolve().parent + elif files: + base_dir = Path(files[0]).resolve().parent + if base_dir is None: + return None + + env_path = base_dir / "vllm_env.txt" + if not env_path.exists(): + return None + df = _parse_vllm_env_txt(env_path) + return df + + # ----------------------------- # Valid max concurrency summary helpers # ----------------------------- @@ -301,7 +556,11 @@ def _config_value_columns(df: pd.DataFrame, conc_col: str) -> list[str]: def _max_concurrency_ok( - df: pd.DataFrame, conc_col: str, cfg_col: str, threshold: float + df: pd.DataFrame, + conc_col: str, + cfg_col: str, + threshold: float, + slack_pct: float = 0.0, ): if df is None or conc_col not in df.columns or cfg_col not in df.columns: return pd.NA @@ -314,7 +573,14 @@ def _max_concurrency_ok( if d.empty: return pd.NA - ok = d[d[cfg_col] <= threshold] + # Accept values up to (1 + slack_pct%) above the SLA. + try: + slack_pct = float(slack_pct or 0.0) + except Exception: + slack_pct = 0.0 + effective_limit = float(threshold) * (1.0 + slack_pct / 100.0) + + ok = d[d[cfg_col] <= effective_limit] if ok.empty: return pd.NA @@ -380,15 +646,25 @@ def build_valid_max_concurrency_summary_html( if not cfg_cols: cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) + # Display SLA ranges in the table header (SLA .. 
SLA*(1+slack)) + ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0) + tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0) + ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)" + tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)" + rows = [] for cfg in cfg_cols: ttft_max = ( - _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms) + _max_concurrency_ok( + ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct + ) if ttft_group_df is not None else pd.NA ) tpot_max = ( - _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms) + _max_concurrency_ok( + tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct + ) if tpot_group_df is not None else pd.NA ) @@ -417,8 +693,8 @@ def build_valid_max_concurrency_summary_html( rows.append( { "Configuration": cfg, - f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max, - f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max, + f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max, + f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max, f"Max {conc_col} (Both)": both, "Output Tput @ Both (tok/s)": tput_at_both, "TTFT @ Both (ms)": ttft_at_both, @@ -428,7 +704,6 @@ def build_valid_max_concurrency_summary_html( summary_df = pd.DataFrame(rows) - # --- Coerce numeric columns so Styler doesn't miss them due to object dtype --- for c in summary_df.columns: if c == "Configuration": continue @@ -436,12 +711,10 @@ def build_valid_max_concurrency_summary_html( both_col = f"Max {conc_col} (Both)" - # --- Strict 2-decimal formatting for ALL non-Configuration columns --- formatters = {} for c in summary_df.columns: if c == "Configuration": continue - # default argument binds per-column formatter correctly formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}" styler = summary_df.style.format(formatters) @@ -460,6 +733,104 @@ def build_valid_max_concurrency_summary_html( return title + 
styler.to_html(table_attributes='border="1" class="dataframe"') +def build_valid_max_concurrency_summary_df( + tput_group_df: pd.DataFrame | None, + ttft_group_df: pd.DataFrame | None, + tpot_group_df: pd.DataFrame | None, + conc_col: str, + args, +) -> pd.DataFrame | None: + if ttft_group_df is None and tpot_group_df is None: + return None + + ttft_cols = ( + _config_value_columns(ttft_group_df, conc_col) + if ttft_group_df is not None + else [] + ) + tpot_cols = ( + _config_value_columns(tpot_group_df, conc_col) + if tpot_group_df is not None + else [] + ) + tput_cols = ( + _config_value_columns(tput_group_df, conc_col) + if tput_group_df is not None + else [] + ) + + if ttft_group_df is not None and tpot_group_df is not None: + cfg_cols = [c for c in ttft_cols if c in tpot_cols] + if tput_group_df is not None: + cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols + else: + cfg_cols = ttft_cols or tpot_cols + + if not cfg_cols: + cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) + + ttft_hi = args.ttft_max_ms * (1.0 + args.ttft_slack_pct / 100.0) + tpot_hi = args.tpot_max_ms * (1.0 + args.tpot_slack_pct / 100.0) + ttft_range = f"{args.ttft_max_ms:g}–{ttft_hi:g} ms (+{args.ttft_slack_pct:g}%)" + tpot_range = f"{args.tpot_max_ms:g}–{tpot_hi:g} ms (+{args.tpot_slack_pct:g}%)" + + rows = [] + for cfg in cfg_cols: + ttft_max = ( + _max_concurrency_ok( + ttft_group_df, conc_col, cfg, args.ttft_max_ms, args.ttft_slack_pct + ) + if ttft_group_df is not None + else pd.NA + ) + tpot_max = ( + _max_concurrency_ok( + tpot_group_df, conc_col, cfg, args.tpot_max_ms, args.tpot_slack_pct + ) + if tpot_group_df is not None + else pd.NA + ) + both = ( + pd.NA + if (pd.isna(ttft_max) or pd.isna(tpot_max)) + else min(ttft_max, tpot_max) + ) + + tput_at_both = ( + _value_at_concurrency(tput_group_df, conc_col, cfg, both) + if tput_group_df is not None + else pd.NA + ) + ttft_at_both = ( + _value_at_concurrency(ttft_group_df, conc_col, cfg, both) 
+ if ttft_group_df is not None + else pd.NA + ) + tpot_at_both = ( + _value_at_concurrency(tpot_group_df, conc_col, cfg, both) + if tpot_group_df is not None + else pd.NA + ) + + rows.append( + { + "Configuration": cfg, + f"Max {conc_col} (TTFT ≤ {ttft_range})": ttft_max, + f"Max {conc_col} (TPOT ≤ {tpot_range})": tpot_max, + f"Max {conc_col} (Both)": both, + "Output Tput @ Both (tok/s)": tput_at_both, + "TTFT @ Both (ms)": ttft_at_both, + "TPOT @ Both (ms)": tpot_at_both, + } + ) + + df = pd.DataFrame(rows) + for c in df.columns: + if c != "Configuration": + df[c] = pd.to_numeric(df[c], errors="coerce") + return df + + # ----------------------------- # Plot helper # ----------------------------- @@ -537,6 +908,35 @@ def build_parser() -> argparse.ArgumentParser: default=100.0, help="Reference limit for TPOT plots (ms)", ) + + # ---- SLA tolerance (slack) options ---- + parser.add_argument( + "--ttft-slack-pct", + type=float, + default=5.0, + help="Allowed percentage above TTFT SLA (default: 5).", + ) + parser.add_argument( + "--tpot-slack-pct", + type=float, + default=5.0, + help="Allowed percentage above TPOT SLA (default: 5).", + ) + + # ---- export options ---- + parser.add_argument( + "--excel-out", + type=str, + default="perf_comparison.xlsx", + help="Write one sheet per (Model, Dataset, Input Len, Output Len).", + ) + parser.add_argument( + "--csv-out-dir", + type=str, + default="", + help="If set, write per-group per-metric CSVs into this directory.", + ) + return parser @@ -615,9 +1015,13 @@ def render_metric_table_html( metric_name = metric_label.lower() if "ttft" in metric_name: - styler = _highlight_threshold(display_group, args.ttft_max_ms) + styler = _highlight_threshold( + display_group, args.ttft_max_ms, args.ttft_slack_pct + ) elif ("tpot" in metric_name) or ("median" in metric_name) or ("p99" in metric_name): - styler = _highlight_threshold(display_group, args.tpot_max_ms) + styler = _highlight_threshold( + display_group, args.tpot_max_ms, 
args.tpot_slack_pct + ) else: styler = display_group.style @@ -657,7 +1061,6 @@ def maybe_write_plot( markers=True, ) - # Ensure plot hover + y tick labels are also 2 decimals. fig.update_traces(hovertemplate="%{y:.2f}") fig.update_yaxes(tickformat=".2f") @@ -730,87 +1133,186 @@ def write_report_group_first( for metric_label, (df, _) in metric_cache.items() } - with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: - main_fh.write('\n') - for gkey in group_keys: - gkey_tuple = normalize_group_key(gkey) - suffix = build_group_suffix(group_cols_canonical, gkey_tuple) - sub_path = group_filename(gkey_tuple) - group_header = ( - '
' - f"{_html.escape(suffix)}" - "
\n" - ) - - main_fh.write(group_header) - with open(sub_path, "w", encoding="utf-8") as sub_fh: - sub_fh.write('\n') - sub_fh.write(group_header) - tput_group_df = None - ttft_group_df = None - tpot_group_df = None - conc_col = args.xaxis - - for metric_label in plan.data_cols: - gb = metric_groupbys[metric_label] - df_sorted, raw_data_cols = metric_cache[metric_label] - - try: - group_df = gb.get_group(gkey) - except KeyError: - missing = ( - '
' - f"{_html.escape(metric_label)} — missing for this group" - "
\n" - ) + csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None + if csv_dir: + csv_dir.mkdir(parents=True, exist_ok=True) - main_fh.write(missing) - sub_fh.write(missing) - continue + excel_path = args.excel_out or "perf_comparison.xlsx" + disable_excel = os.getenv("VLLM_COMPARE_DISABLE_EXCEL", "0") == "1" - if conc_col not in group_df.columns: - conc_col = _find_concurrency_col(group_df) + # Prefer xlsxwriter for speed; fallback to openpyxl if unavailable. + excel_engine = ( + os.getenv("VLLM_COMPARE_EXCEL_ENGINE", "xlsxwriter").strip() or "xlsxwriter" + ) + if excel_engine == "xlsxwriter" and util.find_spec("xlsxwriter") is None: + excel_engine = "openpyxl" + + excel_engine_kwargs = {} + if excel_engine == "xlsxwriter": + # Reduce memory pressure & usually faster writes. + excel_engine_kwargs = {"options": {"constant_memory": True}} + + xw_ctx = ( + nullcontext(None) + if disable_excel + else pd.ExcelWriter( + excel_path, engine=excel_engine, engine_kwargs=excel_engine_kwargs + ) + ) + with xw_ctx as xw: + used_sheets: set[str] = set() + # ---- Environment sheet (first) ---- + env_sheet = _sanitize_sheet_name("Environment") + env_df = _load_env_df_for_inputs(args, files) + if xw is not None: + if env_df is None or env_df.empty: + pd.DataFrame( + [ + { + "Section": "Environment", + "Key": "vllm_env.txt", + "Value": "NOT FOUND (or empty)", + } + ] + ).to_excel(xw, sheet_name=env_sheet, index=False) + else: + env_df.to_excel(xw, sheet_name=env_sheet, index=False) + used_sheets.add(env_sheet) + with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: + main_fh.write('\n') + for gkey in group_keys: + gkey_tuple = normalize_group_key(gkey) + suffix = build_group_suffix(group_cols_canonical, gkey_tuple) + sub_path = group_filename(gkey_tuple) + group_header = ( + '
' + f"{_html.escape(suffix)}" + "
\n" + ) - mn = metric_label.lower().strip() - if "tok/s" in mn: - tput_group_df = group_df - elif "ttft" in mn: - ttft_group_df = group_df - elif mn in ("p99", "median") or "tpot" in mn: - tpot_group_df = group_df + main_fh.write(group_header) + + do_excel = xw is not None + sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple) + sheet_base = sheet + if do_excel: + dedup_i = 1 + while sheet in used_sheets: + dedup_i += 1 + suffix = f"_{dedup_i}" + # Ensure uniqueness even when sheet names are truncated. + base = str(sheet_base) + keep = max(1, 31 - len(suffix)) + sheet = _sanitize_sheet_name(base[:keep] + suffix) + used_sheets.add(sheet) + + excel_blocks: list[tuple[str, pd.DataFrame]] = [] + + with open(sub_path, "w", encoding="utf-8") as sub_fh: + sub_fh.write('\n') + sub_fh.write(group_header) + tput_group_df = None + ttft_group_df = None + tpot_group_df = None + conc_col = args.xaxis + + for metric_label in plan.data_cols: + gb = metric_groupbys[metric_label] + df_sorted, raw_data_cols = metric_cache[metric_label] + + try: + group_df = gb.get_group(gkey) + except KeyError: + missing = ( + '
' + f"{_html.escape(metric_label)} — missing for this group" + "
\n" + ) + main_fh.write(missing) + sub_fh.write(missing) + continue + + if conc_col not in group_df.columns: + conc_col = _find_concurrency_col(group_df) + + mn = metric_label.lower().strip() + if "tok/s" in mn: + tput_group_df = group_df + elif "ttft" in mn: + ttft_group_df = group_df + elif mn in ("p99", "median") or "tpot" in mn: + tpot_group_df = group_df + + display_group = group_df.drop( + columns=group_cols_canonical, errors="ignore" + ) - display_group = group_df.drop( - columns=group_cols_canonical, errors="ignore" - ) + html = render_metric_table_html( + display_group, metric_label, suffix, args + ) + main_fh.write(html) + sub_fh.write(html) + + maybe_write_plot( + main_fh, + sub_fh, + group_df=group_df, + raw_data_cols=raw_data_cols, + metric_label=metric_label, + y_axis_col=y_axis_col, + args=args, + ) - html = render_metric_table_html( - display_group, metric_label, suffix, args + excel_blocks.append( + (metric_label, group_df.reset_index(drop=True)) + ) + if csv_dir: + fn = _safe_filename( + f"{sheet}__{metric_label}".replace(" ", "_").replace( + "/", "_" + ) + ) + group_df.to_csv(csv_dir / f"{fn}.csv", index=False) + + summary_html = build_valid_max_concurrency_summary_html( + tput_group_df=tput_group_df, + ttft_group_df=ttft_group_df, + tpot_group_df=tpot_group_df, + conc_col=conc_col, + args=args, ) - main_fh.write(html) - sub_fh.write(html) - - maybe_write_plot( - main_fh, - sub_fh, - group_df=group_df, - raw_data_cols=raw_data_cols, - metric_label=metric_label, - y_axis_col=y_axis_col, + if summary_html: + main_fh.write(summary_html) + sub_fh.write(summary_html) + + summary_df = build_valid_max_concurrency_summary_df( + tput_group_df=tput_group_df, + ttft_group_df=ttft_group_df, + tpot_group_df=tpot_group_df, + conc_col=conc_col, args=args, ) + if summary_df is not None: + excel_blocks.append( + ("Valid Max Concurrency Summary", summary_df) + ) + if csv_dir: + fn = _safe_filename( + f"{sheet}__Valid_Max_Concurrency_Summary" + ) + 
summary_df.to_csv(csv_dir / f"{fn}.csv", index=False) - summary_html = build_valid_max_concurrency_summary_html( - tput_group_df=tput_group_df, - ttft_group_df=ttft_group_df, - tpot_group_df=tpot_group_df, - conc_col=conc_col, - args=args, - ) - if summary_html: - main_fh.write(summary_html) - sub_fh.write(summary_html) + if do_excel: + _write_tables_to_excel_sheet(xw, sheet, excel_blocks) + + if disable_excel: + print("Skipped Excel generation (VLLM_COMPARE_DISABLE_EXCEL=1).") + else: + print(f"Wrote Excel: {excel_path}") + if csv_dir: + print(f"Wrote CSVs under: {csv_dir}") def main(): diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh old mode 100755 new mode 100644 index d62c01bc7b0911c0bbb06cec4a84cdbd31f8166a..91032978eca94b86734de938e17b6384d0034940 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -1,6 +1,4 @@ #!/bin/bash - -# This script should be run inside the CI process # This script assumes that we are already inside the vllm/ directory # Benchmarking results will be available inside vllm/benchmarks/results/ @@ -9,14 +7,26 @@ set -x set -o pipefail +# Environment-driven debug controls (like ON_CPU=1) +DRY_RUN="${DRY_RUN:-0}" +MODEL_FILTER="${MODEL_FILTER:-}" +DTYPE_FILTER="${DTYPE_FILTER:-}" + +# Adaptive search controls +ENABLE_ADAPTIVE_CONCURRENCY="${ENABLE_ADAPTIVE_CONCURRENCY:-0}" +SLA_TTFT_MS="${SLA_TTFT_MS:-3000}" +SLA_TPOT_MS="${SLA_TPOT_MS:-100}" +ADAPTIVE_MAX_PROBES="${ADAPTIVE_MAX_PROBES:-8}" +ADAPTIVE_MAX_CONCURRENCY="${ADAPTIVE_MAX_CONCURRENCY:-1024}" + check_gpus() { if command -v nvidia-smi; then # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . 
|| true) elif command -v amd-smi; then - declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) + declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true) elif command -v hl-smi; then - declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l) + declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true) fi if [[ $gpu_count -gt 0 ]]; then @@ -44,7 +54,7 @@ check_cpus() { declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') if [[ $numa_count -gt 0 ]]; then echo "NUMA found." - echo $numa_count + echo "$numa_count" else echo "Need at least 1 NUMA to run benchmarking." exit 1 @@ -112,13 +122,12 @@ json2envs() { } wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes local timeout_val="1200" timeout "$timeout_val" bash -c ' - until curl -X POST localhost:8000/v1/completions; do + until curl -sf http://localhost:8000/v1/models >/dev/null; do sleep 1 - done' && return 0 || return 1 + done + ' } kill_processes_launched_by_current_bash() { @@ -181,6 +190,304 @@ upload_to_buildkite() { $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" } +# ------------------------------- +# Adaptive concurrency helpers +# ------------------------------- +result_json_path_for_serving() { + local test_name=$1 + local qps=$2 + local max_concurrency=$3 + echo "$RESULTS_FOLDER/${test_name}_qps_${qps}_concurrency_${max_concurrency}.json" +} + +extract_metric_ms() { + local metric_name=$1 + local json_file=$2 + + [[ -f "$json_file" ]] || return 0 + + if [[ "$metric_name" == "ttft" ]]; then + jq -r ' + [ + .ttft_ms.p99?, + .metrics.ttft_ms.p99?, + .ttft.p99?, + .metrics.ttft.p99?, + .p99_ttft_ms?, + .ttft_ms.mean?, + .metrics.ttft_ms.mean?, + .ttft.mean?, + .metrics.ttft.mean?, + .mean_ttft_ms? + ] | map(select(. 
!= null)) | .[0] // empty + ' "$json_file" + else + jq -r ' + [ + .tpot_ms.p99?, + .metrics.tpot_ms.p99?, + .tpot.p99?, + .metrics.tpot.p99?, + .p99_tpot_ms?, + .itl_ms.p99?, + .metrics.itl_ms.p99?, + .inter_token_latency_ms.p99?, + .tpot_ms.mean?, + .metrics.tpot_ms.mean?, + .tpot.mean?, + .metrics.tpot.mean?, + .itl_ms.mean?, + .metrics.itl_ms.mean?, + .mean_tpot_ms?, + .mean_itl_ms? + ] | map(select(. != null)) | .[0] // empty + ' "$json_file" + fi +} + +evaluate_sla_from_json() { + local json_file=$1 + local ttft + local tpot + local pass + + [[ -f "$json_file" ]] || return 2 + + ttft=$(extract_metric_ms ttft "$json_file") + tpot=$(extract_metric_ms tpot "$json_file") + + [[ -n "$ttft" && -n "$tpot" ]] || return 2 + + pass=$(jq -n \ + --argjson ttft "$ttft" \ + --argjson tpot "$tpot" \ + --argjson sla_ttft "$SLA_TTFT_MS" \ + --argjson sla_tpot "$SLA_TPOT_MS" \ + '($ttft <= $sla_ttft) and ($tpot <= $sla_tpot)') + + [[ "$pass" == "true" ]] +} + +write_adaptive_summary_json() { + local summary_file=$1 + local test_name=$2 + local qps=$3 + local static_last_pass=$4 + local static_first_fail=$5 + local final_last_pass=$6 + local final_first_fail=$7 + + jq -n \ + --arg test_name "$test_name" \ + --arg qps "$qps" \ + --argjson sla_ttft "$SLA_TTFT_MS" \ + --argjson sla_tpot "$SLA_TPOT_MS" \ + --arg static_last_pass "${static_last_pass:-}" \ + --arg static_first_fail "${static_first_fail:-}" \ + --arg final_last_pass "${final_last_pass:-}" \ + --arg final_first_fail "${final_first_fail:-}" \ + '{ + test_name: $test_name, + qps: $qps, + sla_ttft_ms: $sla_ttft, + sla_tpot_ms: $sla_tpot, + static_last_pass: (if $static_last_pass == "" then null else ($static_last_pass | tonumber) end), + static_first_fail: (if $static_first_fail == "" then null else ($static_first_fail | tonumber) end), + final_last_pass: (if $final_last_pass == "" then null else ($final_last_pass | tonumber) end), + final_first_fail: (if $final_first_fail == "" then null else ($final_first_fail | 
tonumber) end) + }' > "$summary_file" +} + +run_single_serving_probe() { + local test_name=$1 + local qps=$2 + local max_concurrency=$3 + local tp=$4 + local compilation_config_mode=$5 + local optimization_level=$6 + local client_args_effective=$7 + local client_remote_args=$8 + local server_command=$9 + + local new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}" + local result_json + local num_prompts_arg="" + local client_command + + result_json=$(result_json_path_for_serving "$test_name" "$qps" "$max_concurrency") + + if [[ -f "$result_json" ]]; then + evaluate_sla_from_json "$result_json" + return $? + fi + + if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then + num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY )) + if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi + if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi + num_prompts_arg="--num-prompts $num_prompts" + fi + + client_command="vllm bench serve \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --max-concurrency $max_concurrency \ + $num_prompts_arg \ + --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level adaptive_search=1 \ + $client_args_effective $client_remote_args " + + echo "Adaptive probe: $client_command" + + if [[ "${DRY_RUN:-0}" != "1" ]]; then + bash -c "$client_command" + fi + + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + adaptive_search: true + }') + echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands" + + evaluate_sla_from_json "$result_json" +} + +adaptive_refine_from_static_results() { + local test_name=$1 + local qps=$2 + local max_concurrency_list_raw=$3 + local tp=$4 + local compilation_config_mode=$5 + 
local optimization_level=$6 + local client_args_effective=$7 + local client_remote_args=$8 + local server_command=$9 + + local sorted_points + local point + local rc + local static_last_pass="" + local static_first_fail="" + local largest_static="" + local step_hint=1 + local previous_point="" + local low + local high + local mid + local probes=0 + local summary_file="$RESULTS_FOLDER/${test_name}_qps_${qps}_sla_summary.json" + + [[ "${ENABLE_ADAPTIVE_CONCURRENCY}" == "1" ]] || return 0 + [[ "${DRY_RUN:-0}" != "1" ]] || return 0 + + sorted_points=$(for point in $max_concurrency_list_raw; do printf '%s\n' "$point"; done | tr -d "'" | awk '/^[0-9]+$/' | sort -n | uniq) + [[ -n "$sorted_points" ]] || return 0 + + while read -r point; do + [[ -z "$point" ]] && continue + largest_static="$point" + evaluate_sla_from_json "$(result_json_path_for_serving "$test_name" "$qps" "$point")" + rc=$? + if (( rc == 0 )); then + static_last_pass="$point" + elif (( rc == 1 )); then + if [[ -n "$static_last_pass" ]]; then + static_first_fail="$point" + break + fi + fi + + if [[ -n "$previous_point" ]]; then + step_hint=$(( point - previous_point )) + if (( step_hint < 1 )); then step_hint=1; fi + fi + previous_point="$point" + done <<< "$sorted_points" + + if [[ -z "$static_last_pass" ]]; then + write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "" "$static_first_fail" "" "$static_first_fail" + return 0 + fi + + if [[ -n "$static_first_fail" ]]; then + low=$static_last_pass + high=$static_first_fail + while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do + mid=$(( (low + high) / 2 )) + probes=$(( probes + 1 )) + run_single_serving_probe \ + "$test_name" "$qps" "$mid" "$tp" \ + "$compilation_config_mode" "$optimization_level" \ + "$client_args_effective" "$client_remote_args" "$server_command" + rc=$? 
+ if (( rc == 0 )); then + low=$mid + elif (( rc == 1 )); then + high=$mid + else + break + fi + done + write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "$static_first_fail" "$low" "$high" + return 0 + fi + + low=$largest_static + high="" + while (( probes < ADAPTIVE_MAX_PROBES )); do + point=$(( low + step_hint )) + if (( point > ADAPTIVE_MAX_CONCURRENCY )); then + point=$ADAPTIVE_MAX_CONCURRENCY + fi + (( point > low )) || break + probes=$(( probes + 1 )) + run_single_serving_probe \ + "$test_name" "$qps" "$point" "$tp" \ + "$compilation_config_mode" "$optimization_level" \ + "$client_args_effective" "$client_remote_args" "$server_command" + rc=$? + if (( rc == 0 )); then + low=$point + (( point == ADAPTIVE_MAX_CONCURRENCY )) && break + step_hint=$(( step_hint * 2 )) + if (( step_hint < 1 )); then step_hint=1; fi + elif (( rc == 1 )); then + high=$point + break + else + break + fi + done + + if [[ -n "$high" ]]; then + while (( low + 1 < high )) && (( probes < ADAPTIVE_MAX_PROBES )); do + mid=$(( (low + high) / 2 )) + probes=$(( probes + 1 )) + run_single_serving_probe \ + "$test_name" "$qps" "$mid" "$tp" \ + "$compilation_config_mode" "$optimization_level" \ + "$client_args_effective" "$client_remote_args" "$server_command" + rc=$? 
+ if (( rc == 0 )); then + low=$mid + elif (( rc == 1 )); then + high=$mid + else + break + fi + done + fi + + write_adaptive_summary_json "$summary_file" "$test_name" "$qps" "$static_last_pass" "" "$low" "$high" +} + run_benchmark_tests() { # run benchmark tests using `vllm bench ` command # $1: test type (latency or throughput) @@ -252,37 +559,16 @@ run_benchmark_tests() { done } -run_latency_tests() { - run_benchmark_tests "latency" "$1" -} - -run_startup_tests() { - run_benchmark_tests "startup" "$1" -} - -run_throughput_tests() { - run_benchmark_tests "throughput" "$1" -} - -run_serving_tests() { - # run serving tests using `vllm bench serve` command - # $1: a json file specifying serving test cases - # - # Supported JSON formats: - # 1) Plain format: top-level array - # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] - # - # 2) Default parameters field + plain format tests - # { - # "defaults": { ... }, - # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] - # } +run_latency_tests() { run_benchmark_tests "latency" "$1"; } +run_startup_tests() { run_benchmark_tests "startup" "$1"; } +run_throughput_tests() { run_benchmark_tests "throughput" "$1"; } - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c ' +merge_serving_tests_stream() { + # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode. + # This helper does NOT modify JSON; it only filters the stream in dry-run mode. 
+ local serving_test_file="$1" + # shellcheck disable=SC2016 + local merged=' if type == "array" then # Plain format: test cases array .[] @@ -304,7 +590,50 @@ run_serving_tests() { else error("Unsupported serving test file format: must be array or object with .tests") end - ' "$serving_test_file" | while read -r params; do + ' + + jq -c "$merged" "$serving_test_file" | \ + if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then + jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" ' + select((($model|length)==0) + or ((.server_parameters.model // "") == $model) + or ((.client_parameters.model // "") == $model)) + | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype)) + ' + else + cat + fi +} + +run_serving_tests() { + # run serving tests using `vllm bench serve` command + # $1: a json file specifying serving test cases + # + # Supported JSON formats: + # 1) Plain format: top-level array + # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # + # 2) Default parameters field + plain format tests + # { + # "defaults": { ... }, + # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # } + + local serving_test_file + serving_test_file=$1 + + # In dry-run mode, if filters are provided but no tests match, fail fast. + if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then + local count + count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ') + if [[ "$count" -eq 0 ]]; then + echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2 + return 0 + fi + fi + + # Iterate over serving tests (merged + optional filtered stream) + merge_serving_tests_stream "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') if [[ ! 
"$test_name" =~ ^serving_ ]]; then @@ -323,10 +652,48 @@ run_serving_tests() { server_envs=$(echo "$params" | jq -r '.server_environment_variables') client_params=$(echo "$params" | jq -r '.client_parameters') - server_args=$(json2args "$server_params") + # vLLM serve CLI: model must be positional (no --model). Convert server_parameters accordingly. + server_model=$(echo "$server_params" | jq -r '.model // empty') + if [[ -z "$server_model" || "$server_model" == "null" ]]; then + echo "Error: serving test '$test_name' is missing server_parameters.model" >&2 + exit 1 + fi + server_params_no_model=$(echo "$server_params" | jq -c 'del(.model)') + server_args=$(json2args "$server_params_no_model") + server_envs=$(json2envs "$server_envs") client_args=$(json2args "$client_params") + # ------------------------------------------------------------ + # Option 1: Dynamic num-prompts scaling based on max_concurrency + # + # If PROMPTS_PER_CONCURRENCY is set, override JSON num_prompts with: + # num_prompts = max_concurrency * PROMPTS_PER_CONCURRENCY + # + # If PROMPTS_PER_CONCURRENCY is NOT set, keep JSON num_prompts behavior + # unchanged (i.e., whatever is in serving-tests-*.json). 
+ # ------------------------------------------------------------ + PROMPTS_PER_CONCURRENCY="${PROMPTS_PER_CONCURRENCY-}" # no default on purpose + MIN_NUM_PROMPTS="${MIN_NUM_PROMPTS:-1}" + MAX_NUM_PROMPTS="${MAX_NUM_PROMPTS:-1000000}" + + if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then + # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates) + # Remove any fixed --num-prompts from JSON-derived args (avoid duplicates) + # Handles: --num-prompts 123 and --num-prompts=123 + client_args_no_np="$( + printf ' %s ' "$client_args" \ + | sed -E \ + -e 's/[[:space:]]--num-prompts=([^[:space:]]+)([[:space:]]|$)/ /g' \ + -e 's/[[:space:]]--num-prompts[[:space:]]+([^[:space:]]+)([[:space:]]|$)/ /g' + )" + # normalize whitespace + client_args_no_np="$(echo "$client_args_no_np" | tr -s ' ' | sed -E 's/^ //; s/ $//')" + client_args_no_np="$(echo "$client_args_no_np" | xargs)" + client_args_effective="$client_args_no_np" + else + client_args_effective="$client_args" + fi # qps_list qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') @@ -358,14 +725,13 @@ run_serving_tests() { fi # check if server model and client model is aligned - server_model=$(echo "$server_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model') if [[ $server_model != "$client_model" ]]; then echo "Server model and client model must be the same. Skip testcase $test_name." continue fi - server_command="$server_envs vllm serve \ + server_command="$server_envs vllm serve $server_model \ $server_args" # run the server @@ -373,7 +739,7 @@ run_serving_tests() { echo "Server command: $server_command" # support remote vllm server client_remote_args="" - if [[ -z "${REMOTE_HOST}" ]]; then + if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then bash -c "$server_command" & server_pid=$! # wait until the server is alive @@ -384,6 +750,9 @@ run_serving_tests() { echo "" echo "vLLM failed to start within the timeout period." 
fi + elif [[ "${DRY_RUN:-0}" == "1" ]]; then + # dry-run: don't start server + echo "Dry Run." else server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" if [[ ${REMOTE_PORT} ]]; then @@ -402,15 +771,21 @@ run_serving_tests() { for qps in $qps_list; do # remove the surrounding single quote from qps if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" qps="inf" - echo "now qps is $qps" fi # iterate over different max_concurrency for max_concurrency in $max_concurrency_list; do - new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency + new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}" echo " new test name $new_test_name" + # If PROMPTS_PER_CONCURRENCY is set, compute per-concurrency --num-prompts. + num_prompts_arg="" + if [[ -n "${PROMPTS_PER_CONCURRENCY}" ]]; then + num_prompts=$(( max_concurrency * PROMPTS_PER_CONCURRENCY )) + if (( num_prompts < MIN_NUM_PROMPTS )); then num_prompts=$MIN_NUM_PROMPTS; fi + if (( num_prompts > MAX_NUM_PROMPTS )); then num_prompts=$MAX_NUM_PROMPTS; fi + num_prompts_arg="--num-prompts $num_prompts" + fi # pass the tensor parallel size, the compilation mode, and the optimization # level to the client so that they can be used on the benchmark dashboard client_command="vllm bench serve \ @@ -419,13 +794,16 @@ run_serving_tests() { --result-filename ${new_test_name}.json \ --request-rate $qps \ --max-concurrency $max_concurrency \ + $num_prompts_arg \ --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \ - $client_args $client_remote_args " + $client_args_effective $client_remote_args " echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" - bash -c "$client_command" + if [[ "${DRY_RUN:-0}" != "1" ]]; then + bash -c "$client_command" + fi # record the benchmarking commands jq_output=$(jq -n \ @@ -440,15 +818,23 @@ run_serving_tests() { echo "$jq_output" 
>"$RESULTS_FOLDER/${new_test_name}.commands" done + + adaptive_refine_from_static_results \ + "$test_name" "$qps" "$max_concurrency_list" "$tp" \ + "$compilation_config_mode" "$optimization_level" \ + "$client_args_effective" "$client_remote_args" "$server_command" done # clean up - kill -9 $server_pid - kill_gpu_processes + if [[ "${DRY_RUN:-0}" != "1" ]]; then + kill -9 "$server_pid" + kill_gpu_processes + fi done } main() { + local ARCH ARCH='' if [[ "$ON_CPU" == "1" ]]; then @@ -458,7 +844,13 @@ main() { check_gpus ARCH="$arch_suffix" fi - check_hf_token + + # DRY_RUN does not execute vLLM; do not require HF_TOKEN. + if [[ "${DRY_RUN:-0}" != "1" ]]; then + check_hf_token + else + echo "DRY_RUN=1 -> skip HF_TOKEN validation" + fi # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) @@ -479,11 +871,16 @@ main() { # dump vllm info via vllm collect-env env_output=$(vllm collect-env) - echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" # benchmarking - run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $? 
+ + if [[ "${DRY_RUN:-0}" == "1" ]]; then + echo "DRY_RUN=1 -> skip latency/startup/throughput suites" + exit 0 + fi + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" @@ -491,6 +888,7 @@ main() { # postprocess benchmarking results pip install tabulate pandas python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py + python3 $QUICK_BENCHMARK_ROOT/scripts/compare-json-results.py -f $RESULTS_FOLDER/benchmark_results.json upload_to_buildkite } diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json index 296380f72a668b8ce41dc55379d2841d2fd70744..3b3fb4bed8018da034a65133606c5c815539101a 100644 --- a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json +++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json @@ -51,5 +51,56 @@ "max-model-len": 256, "async-scheduling": "" } + }, + { + "test_name": "latency_deepseek_r1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "max-model-len": 2048, + "dtype": "bfloat16" + } + }, + { + "test_name": "latency_llama4_maverick_17b128e_instruct_fp8", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "max-model-len": 512, + "max-num-seqs": 128, + "async-scheduling": "", + "gpu-memory-utilization": 0.95, + "enable_expert_parallel": "" + } + }, + { + "test_name": "latency_qwen3_8b", + 
"environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "max-model-len": 2048, + "max-num-seqs": 128, + "dtype": "bfloat16", + "async-scheduling": "" + } } ] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json new file mode 100644 index 0000000000000000000000000000000000000000..f0dc3d5ec067a9770e7637d532a0580f0cbeaf3e --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-asr.json @@ -0,0 +1,37 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120 + }, + "server_parameters": { + "dtype": "bfloat16", + "model": "openai/whisper-large-v3-turbo" + }, + "client_parameters": { + "model": "openai/whisper-large-v3-turbo", + "backend": "openai-audio", + "endpoint": "/v1/audio/transcriptions", + "dataset_name": "hf", + "dataset_path": "openslr/librispeech_asr", + "hf_subset": "clean", + "hf_split": "test", + "no_stream": "", + "no_oversample": "", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_whisper_large_v3_turbo_librispeech_clean_tp1", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": {} + } + ] +} diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json new file mode 100644 index 0000000000000000000000000000000000000000..6d3455c478ca0a9917535b4678de8ee1537999a4 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json @@ -0,0 +1,41 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [ + 32, + 64, + 128 + ], + 
"server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "dtype": "bfloat16", + "model": "jinaai/jina-embeddings-v3", + "trust_remote_code": "" + }, + "client_parameters": { + "model": "jinaai/jina-embeddings-v3", + "backend": "openai-embeddings", + "endpoint": "/v1/embeddings", + "dataset_name": "sharegpt", + "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_jina_embed_v3_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": {} + } + ] +} diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json new file mode 100644 index 0000000000000000000000000000000000000000..0411b04e1bd5f47f4ec82154ea2bc078e605305c --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json @@ -0,0 +1,355 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + 
"client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + 
"test_name": "serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp4_random_2048_2048", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 2 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "server_parameters": { + "model": 
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 4 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int8_tp1_random_128_128", + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int8_tp2_random_128_128", + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 2 + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int8_tp4_random_128_128", + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4 + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama3B_tp1_random_128_128", + "server_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_granite2B_tp1_random_128_128", + "server_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "dataset_name": "random", + "random-input-len": 128, + 
"random-output-len": 128 + } + }, + { + "test_name": "serving_qwen1.7B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-1.7B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-1.7B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen4B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-4B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-4B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen8B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_glm9B_tp1_random_128_128", + "server_parameters": { + "model": "zai-org/glm-4-9b-hf", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "zai-org/glm-4-9b-hf", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_gemma7B_tp1_random_128_128", + "server_parameters": { + "model": "google/gemma-7b", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "google/gemma-7b", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + } + ] +} diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index 25ed7415ec0e48b65e19123493aff4a9977a2296..f66ef2af4bd655b0308976b637dd2b7654015deb 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -72,17 +72,6 @@ "random-output-len": 128 } }, - { - "test_name": "serving_llama8B_tp4_random_128_128", - "server_parameters": { - 
"tensor_parallel_size": 4 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, { "test_name": "serving_llama8B_tp1_random_128_2048", "server_parameters": { @@ -105,17 +94,6 @@ "random-output-len": 2048 } }, - { - "test_name": "serving_llama8B_tp4_random_128_2048", - "server_parameters": { - "tensor_parallel_size": 4 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 2048 - } - }, { "test_name": "serving_llama8B_tp1_random_2048_128", "server_parameters": { @@ -139,144 +117,25 @@ } }, { - "test_name": "serving_llama8B_tp4_random_2048_128", - "server_parameters": { - "tensor_parallel_size": 4 - }, - "client_parameters": { - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama8B_int4_tp1_random_128_128", + "test_name": "serving_llama8B_tp1_random_2048_2048", "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "tensor_parallel_size": 1 }, "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 + "random-input-len": 2048, + "random-output-len": 2048 } }, { - "test_name": "serving_llama8B_int4_tp2_random_128_128", + "test_name": "serving_llama8B_tp2_random_2048_2048", "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "tensor_parallel_size": 2 }, "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_random_128_128", - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 4 - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - 
"dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama3B_tp1_random_128_128", - "server_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_granite2B_tp1_random_128_128", - "server_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen1.7B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-1.7B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-1.7B", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen4B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-4B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-4B", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen8B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-8B", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_glm9B_tp1_random_128_128", - "server_parameters": { - "model": "zai-org/glm-4-9b-hf", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "zai-org/glm-4-9b-hf", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_gemma7B_tp1_random_128_128", - "server_parameters": { - "model": 
"google/gemma-7b", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "google/gemma-7b", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 + "random-input-len": 2048, + "random-output-len": 2048 } } ] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json index 8c6b34bd9fa33367a020888fddfe0fc3a5ad2108..3929aa5fbbe0d9b4d8e66064cf5ab7cb1242658f 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json @@ -10,7 +10,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -37,7 +36,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -64,7 +62,6 @@ "server_parameters": { "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "tensor_parallel_size": 2, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy", "max-model-len": 2048, @@ -78,5 +75,83 @@ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_deepseek_r1", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 200, + "async-scheduling": "", + "dtype": "bfloat16" + }, + "client_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": 
"./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama4_maverick_17b128e_instruct_fp8", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "disable_log_stats": "", + "max-model-len": 2048, + "max-num-seqs": 128, + "async-scheduling": "", + "enable_expert_parallel": "", + "max-num-batched-tokens": 4096 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen3_8b", + "qps_list": [1, 4, 10, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "Qwen/Qwen-3-8B", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "disable_log_stats": "", + "async-scheduling": "" + }, + "client_parameters": { + "model": "Qwen/Qwen-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json index a6d4141d5c2dcb28b3ae0172fe781e36c5699708..66d52abc1206fc9bb7f1ef143c84c2728a5003f0 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests.json @@ -5,7 +5,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -23,7 +22,6 
@@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -41,7 +39,6 @@ "server_parameters": { "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "tensor_parallel_size": 2, - "swap_space": 16, "disable_log_stats": "", "load_format": "dummy" }, @@ -59,7 +56,6 @@ "server_parameters": { "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, - "swap_space": 16, "speculative_config": { "model": "turboderp/Qwama-0.5B-Instruct", "num_speculative_tokens": 4, diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json index 3127bf2f6bce376906f419b46424134c86bd97ff..25344348bb39e63569795aef0d80b13180639a51 100644 --- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json @@ -57,5 +57,67 @@ "max-num-seqs": 512, "async-scheduling": "" } + }, + { + "test_name": "throughput_deepseek_r1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "dataset_name": "sharegpt", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 384, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_llama4_maverick_17b128e_instruct_fp8", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "dataset_name": "sharegpt", + 
"num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "", + "enable_expert_parallel": "" + } + }, + { + "test_name": "throughput_qwen3_8b", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "Qwen/Qwen-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "dataset_name": "sharegpt", + "num_prompts": 1000, + "max-num-seqs": 512, + "backend": "vllm", + "async-scheduling": "" + } } ] diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 5dc360c544dbb3527471d11e896f06db768d0098..16ecc515862eb3ed33eca66363fcf83fe7a9397e 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -83,7 +83,7 @@ steps: agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" @@ -152,7 +152,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." 
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: diff --git a/.buildkite/scripts/annotate-rocm-release.sh b/.buildkite/scripts/annotate-rocm-release.sh index 8e7dbfb9e13dc44634b20faf29f9c863142fedb2..8a5b344407cc4716020a634d996a563780c7a7f8 100755 --- a/.buildkite/scripts/annotate-rocm-release.sh +++ b/.buildkite/scripts/annotate-rocm-release.sh @@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}" S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com" # Format ROCm version for path (e.g., "7.1" -> "rocm710") -ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')" +ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')" ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}" buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF ## ROCm Wheel and Docker Image Releases @@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl . -aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl . +aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl . aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl . 
\`\`\` @@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash- - **torchvision**: TorchVision for ROCm PyTorch - **torchaudio**: Torchaudio for ROCm PyTorch - **amdsmi**: AMD SMI Python bindings -- **aiter**: Aiter for ROCm +- **amd_aiter**: Aiter for ROCm - **flash-attn**: Flash Attention for ROCm ### :warning: Notes diff --git a/.buildkite/scripts/cache-rocm-base-wheels.sh b/.buildkite/scripts/cache-rocm-base-wheels.sh index be244725023da4640d41ffb9c11b3a6588e7a8a2..060d09db49d3b37aacdd38843f423dd986baa752 100755 --- a/.buildkite/scripts/cache-rocm-base-wheels.sh +++ b/.buildkite/scripts/cache-rocm-base-wheels.sh @@ -83,7 +83,7 @@ case "${1:-}" in exit 1 fi - WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l) + WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) if [[ "$WHEEL_COUNT" -eq 0 ]]; then echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2 exit 1 @@ -110,9 +110,9 @@ case "${1:-}" in echo "" echo "Downloaded wheels:" - ls -lh artifacts/rocm-base-wheels/ + find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \; - WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l) + WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) echo "" echo "Total: $WHEEL_COUNT wheels" echo "========================================" diff --git a/.buildkite/scripts/check-ray-compatibility.sh b/.buildkite/scripts/check-ray-compatibility.sh new file mode 100644 index 0000000000000000000000000000000000000000..d44d074c2001a8475516fa715960dff250bedbc1 --- /dev/null +++ b/.buildkite/scripts/check-ray-compatibility.sh @@ -0,0 +1,213 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Check if Ray LLM can generate lock files that are compatible with this +# version of vllm. 
Downloads Ray's requirement files and runs a full +# dependency resolution with the installed vllm's constraints to see if +# a valid lock file can be produced. +# +# See: https://github.com/vllm-project/vllm/issues/33599 + +set -eo pipefail + +RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python" + +WORK_DIR=$(mktemp -d) +trap 'rm -rf "$WORK_DIR"' EXIT + +# Fetch all Ray requirement files used in the LLM depset pipeline +echo ">>> Fetching Ray requirement files" +RAY_FILES=( + "requirements.txt" + "requirements/cloud-requirements.txt" + "requirements/base-test-requirements.txt" + "requirements/llm/llm-requirements.txt" + "requirements/llm/llm-test-requirements.txt" +) +for FILE in "${RAY_FILES[@]}"; do + LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")" + echo " ${FILE}" + curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}" +done + +# Extract installed vllm deps +echo ">>> Extracting installed vllm dependency constraints" +python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF' +"""Write out the installed vllm's dependencies as pip constraint lines. + +Ray uses vllm[audio], so audio-extra deps are included with their extra +markers stripped. The resolver cannot evaluate extra markers for a +package that is not itself being resolved from an index, so we activate +them manually here. +""" +import importlib.metadata +import re +import sys + +out_path = sys.argv[1] +raw_reqs = importlib.metadata.requires("vllm") or [] + +# Ray uses vllm[audio] – activate that extra. +ACTIVE_EXTRAS = {"audio"} +EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""") + +lines = [] +for r in raw_reqs: + if ";" not in r: + # Unconditional dep — always include. + lines.append(r.strip()) + continue + + req_part, _, marker_part = r.partition(";") + marker_part = marker_part.strip() + + extra_matches = EXTRA_RE.findall(marker_part) + if not extra_matches: + # Non-extra marker (python_version, etc.) — keep as-is. 
+ lines.append(r.strip()) + continue + + if not ACTIVE_EXTRAS.intersection(extra_matches): + continue # Skip inactive extras (tensorizer, bench, …). + + # Strip the extra== conditions but keep any remaining markers + # (e.g. python_version). + cleaned = EXTRA_RE.sub("", marker_part) + cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned) + cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip() + + if cleaned: + lines.append(f"{req_part.strip()} ; {cleaned}") + else: + lines.append(req_part.strip()) + +with open(out_path, "w") as f: + for line in lines: + f.write(line + "\n") + +print(f"Wrote {len(lines)} constraints to {out_path}") +PYEOF + +echo ">>> Installed vllm deps (first 20 lines):" +head -20 "${WORK_DIR}/vllm-constraints.txt" + +# Remove Ray's vllm pin — the installed vllm's transitive deps +# (written above) replace it in the resolution. vllm itself cannot +# be resolved from PyPI for in-development versions, so we test +# whether Ray's requirements can coexist with vllm's dependency +# constraints instead. +sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt" + +# Install uv if needed +if ! command -v uv &>/dev/null; then + echo ">>> Installing uv" + pip install uv -q +fi + +# Resolve: given vllm's constraints, can Ray compile a lock file? +# +# vllm's dependency constraints are the fixed side — Ray is flexible and +# can regenerate its lock files. We pass vllm's constraints via -c so +# the resolver treats them as non-negotiable bounds, then check whether +# Ray's own requirements can still be satisfied within those bounds. +echo "" +echo "============================================================" +echo ">>> Resolving: Can Ray generate compatible lock files?" 
+echo "============================================================" + +set +e +uv pip compile \ + "${WORK_DIR}/requirements.txt" \ + "${WORK_DIR}/cloud-requirements.txt" \ + "${WORK_DIR}/base-test-requirements.txt" \ + "${WORK_DIR}/llm-requirements.txt" \ + "${WORK_DIR}/llm-test-requirements.txt" \ + -c "${WORK_DIR}/vllm-constraints.txt" \ + --python-version 3.12 \ + --python-platform x86_64-manylinux_2_31 \ + --extra-index-url https://download.pytorch.org/whl/cu129 \ + --index-strategy unsafe-best-match \ + --unsafe-package setuptools \ + --unsafe-package ray \ + --no-header \ + -o "${WORK_DIR}/resolved.txt" \ + 2>&1 +EXIT_CODE=$? +set -e + +echo "" +echo "==========================================" +if [ $EXIT_CODE -eq 0 ]; then + echo "SUCCESS: Ray can generate lock files compatible with this vllm." + echo "" + echo "Key resolved versions:" + grep -E '^(protobuf|torch|numpy|transformers)==' \ + "${WORK_DIR}/resolved.txt" | sort || true + echo "==========================================" + exit 0 +fi + +echo "FAILURE: Ray cannot generate lock files compatible with this vllm." +echo "This means a fundamental dependency conflict exists that Ray" +echo "cannot resolve by regenerating its lock files." +echo "See: https://github.com/vllm-project/vllm/issues/33599" +echo "==========================================" + +# Buildkite annotation +if [ -f /usr/bin/buildkite-agent ]; then + buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF +### :warning: Ray Dependency Compatibility Warning +This PR introduces dependencies that **cannot** be resolved with Ray's requirements. +Ray would not be able to regenerate its lock files to accommodate this vllm version. + +Please check the **Ray Dependency Compatibility Check** step logs for details. +See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context. +EOF +fi + +# Notify Slack if webhook is configured and PR/branch are valid. 
+if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then + PR="${BUILDKITE_PULL_REQUEST:-}" + BRANCH="${BUILDKITE_BRANCH:-}" + + # Skip notification if PR is invalid or branch is empty + if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then + echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)" + else + echo ">>> Sending Slack notification" + # Single quotes are intentional: the f-string expressions are Python, not shell. + # shellcheck disable=SC2016 + PAYLOAD=$(python3 -c ' +import json, os, sys +pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A") +branch = os.getenv("BUILDKITE_BRANCH", "unknown") +url = os.getenv("BUILDKITE_BUILD_URL", "#") +data = { + "text": ":warning: Ray Dependency Compatibility Check Failed", + "blocks": [{ + "type": "section", + "text": { + "type": "mrkdwn", + "text": ( + "*:warning: Ray Dependency Compatibility Check Failed*\n" + f"PR #{pr} on branch `{branch}` introduces dependencies " + f"that cannot be resolved with Ray'\''s requirements.\n" + f"<{url}|View Build>" + ), + }, + }], +} +print(json.dumps(data)) +') + + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \ + -H 'Content-type: application/json' \ + -d "$PAYLOAD") + echo " Slack webhook response: $HTTP_CODE" + fi +else + echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)" +fi + +exit 1 diff --git a/.buildkite/scripts/cherry-pick-from-milestone.sh b/.buildkite/scripts/cherry-pick-from-milestone.sh index 99eb36acd1525461070b8c688aa9197dc343ea82..67f30930bf41d6406ef4d1a924d86bc598addef9 100755 --- a/.buildkite/scripts/cherry-pick-from-milestone.sh +++ b/.buildkite/scripts/cherry-pick-from-milestone.sh @@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..." # Store PR data in a temp file PR_DATA=$(mktemp) -trap "rm -f $PR_DATA" EXIT +trap 'rm -f "$PR_DATA"' EXIT if ! 
gh pr list --state merged --search "milestone:${MILESTONE}" \
     --limit 1000 \
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index f36909396675f2c477fef8adae79b04960629295..1c43c404d247c6159c4af275ebcac2821c685737 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -1,25 +1,57 @@
 #!/bin/bash
 
-# This script runs test inside the corresponding ROCm docker container.
+# This script runs tests inside the corresponding ROCm docker container.
+# It handles both single-node and multi-node test configurations.
+#
+# Multi-node detection: Instead of matching on fragile group names, we detect
+# multi-node jobs structurally by looking for the bracket command syntax
+# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
+#
+###############################################################################
+# QUOTING / COMMAND PASSING
+#
+# Passing commands as positional arguments ($*) is fragile when the command
+# string itself contains double quotes, e.g.:
+#
+#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
+#
+# The outer shell resolves the nested quotes *before* this script runs, so
+# the script receives mangled input it cannot fully recover.
+#
+# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
+#
+#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
+#   bash run-amd-test.sh
+#
+# Single-quoted assignment preserves all inner double quotes verbatim.
+# The $* path is kept for backward compatibility but callers should migrate.
+###############################################################################
 set -o pipefail
 
 # Export Python path
 export PYTHONPATH=".."
 
-# Print ROCm version
-echo "--- Confirming Clean Initial State"
-while true; do
-  sleep 3
-  if grep -q clean /opt/amdgpu/etc/gpu_state; then
-    echo "GPUs state is \"clean\""
-    break
-  fi
-done
-
-echo "--- ROCm info"
-rocminfo
+###############################################################################
+# Helper Functions
+###############################################################################
+
+wait_for_clean_gpus() {  # $1: timeout in seconds (default 300); exits the script with 1 on timeout
+  local timeout=${1:-300}
+  local start=$SECONDS  # SECONDS is bash's running seconds counter; used to measure elapsed time
+  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
+  while true; do
+    if grep -q clean /opt/amdgpu/etc/gpu_state; then
+      echo "GPUs state is \"clean\""
+      return
+    fi
+    if (( SECONDS - start >= timeout )); then
+      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
+      exit 1
+    fi
+    sleep 3  # poll interval between checks of the state file
+  done
+}
 
-# cleanup older docker images
 cleanup_docker() {
   # Get Docker's root directory
   docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -28,15 +60,12 @@ cleanup_docker() {
     exit 1
   fi
   echo "Docker root directory: $docker_root"
-  # Check disk usage of the filesystem where Docker's root directory is located
+
   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-  # Define the threshold
   threshold=70
   if [ "$disk_usage" -gt "$threshold" ]; then
     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-    # Remove dangling images (those that are not tagged and not used by any container)
     docker image prune -f
-    # Remove unused volumes / force the system prune for old images as well.
     docker volume prune -f && docker system prune --force --filter "until=72h" --all
     echo "Docker images and volumes cleanup completed."
   else
@@ -45,193 +74,445 @@ cleanup_docker() {
 }
 
 cleanup_network() {
-  for node in $(seq 0 $((NUM_NODES-1))); do
-    if docker pr -a -q -f name="node${node}" | grep -q .; then
-      docker stop "node${node}"
+  local max_nodes=${NUM_NODES:-2}  # assume two nodes when NUM_NODES is unset
+  for node in $(seq 0 $((max_nodes - 1))); do
+    if docker ps -a -q -f name="node${node}" | grep -q .; then
+      docker stop "node${node}" || true  # best effort: a failed stop must not abort cleanup
     fi
   done
-  if docker network ls | grep docker-net; then
-    docker network rm docker-net
+  if docker network ls | grep -q docker-net; then
+    docker network rm docker-net || true  # best effort, same as above
+  fi
+}
+
+is_multi_node() {  # $1: command string; returns 0 for a multi-node job, 1 otherwise
+  local cmds="$1"
+  # Primary signal: NUM_NODES environment variable set by the pipeline
+  if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
+    return 0
+  fi
+  # Fallback: detect the bracket syntax structurally
+  # Pattern: [...] && [...] (per-node command arrays)
+  if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
+    return 0
+  fi
+  return 1
+}
+
+handle_pytest_exit() {  # $1: exit status to report; this function always terminates the script
+  local exit_code=$1
+  if [ "$exit_code" -eq 5 ]; then
+    echo "Pytest exit code 5 (no tests collected) - treating as success."
+    exit 0
   fi
+  exit "$exit_code"
 }
 
-# Call the cleanup docker function
+###############################################################################
+# Pytest marker/keyword re-quoting
+#
+# When commands are passed through Buildkite -> shell -> $* -> bash -c,
+# quotes around multi-word pytest -m/-k expressions get stripped:
+#   pytest -v -s -m 'not cpu_test' v1/core
+# becomes:
+#   pytest -v -s -m not cpu_test v1/core
+#
+# pytest then interprets "cpu_test" as a file path, not part of the marker.
+#
+# This function detects unquoted expressions after -m/-k and re-quotes them
+# by collecting tokens until a recognizable boundary is reached:
+#   - test path (contains '/')
+#   - test file (ends with '.py')
+#   - another pytest flag (--xxx or -x single-char flags)
+#   - command separator (&& || ; |)
+#   - environment variable assignment (FOO=bar)
+#
+# Single-word markers (e.g.
-m cpu_test, -m hybrid_model) pass through
+# unquoted since they have no spaces and work fine.
+#
+# Already-quoted expressions (containing literal single quotes) are passed
+# through untouched to avoid double-quoting values injected by
+# apply_rocm_test_overrides.
+#
+# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
+# double-quotes stripped by the calling shell (see header comment).
+# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
+###############################################################################
+re_quote_pytest_markers() {  # $1: raw command string; prints the re-quoted command on stdout
+  local input="$1"
+  local output=""
+  local collecting=false
+  local marker_buf=""
+
+  # Strip backslash-newline continuations, then flatten remaining newlines
+  local flat="${input//$'\\\n'/ }"
+  flat="${flat//$'\n'/ }"
+
+  # Disable globbing to prevent *.py etc. from expanding during read -ra
+  local restore_glob
+  restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
+  set -o noglob
+  local -a words
+  read -ra words <<< "$flat"
+  eval "$restore_glob"
+
+  for word in "${words[@]}"; do
+    if $collecting; then
+      # If the token we're about to collect already contains a literal
+      # single quote, the expression was already quoted upstream.
+      # Flush and stop collecting.
+      if [[ "$word" == *"'"* ]]; then
+        if [[ -n "$marker_buf" ]]; then
+          # Should not normally happen (partial buf + quote), flush raw
+          output+="${marker_buf} "
+          marker_buf=""
+        fi
+        output+="${word} "
+        collecting=false
+        continue
+      fi
+
+      local is_boundary=false
+      case "$word" in
+        # Line-continuation artifact
+        "\\")
+          is_boundary=true ;;
+        # Command separators
+        "&&"|"||"|";"|"|")
+          is_boundary=true ;;
+        # Long flags (--ignore, --shard-id, etc.)
+        --*)
+          is_boundary=true ;;
+        # Short flags, single (-v, -s, -x) or bundled/with value (-sv, -rA, -n4),
+        # but NOT marker tokens like "not" which don't start with "-".
+        # -k/-m also match here; they restart collection (handled below).
+        -[a-zA-Z]*)
+          is_boundary=true ;;
+        # Test path (contains /)
+        */*)
+          is_boundary=true ;;
+        # Test file (ends with .py, possibly with ::method)
+        *.py|*.py::*)
+          is_boundary=true ;;
+        # Environment variable assignment preceding a command (FOO=bar)
+        *=*)
+          # Only treat as boundary if it looks like VAR=value, not
+          # pytest filter expressions like num_gpus=2 inside markers
+          if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
+            is_boundary=true
+          fi
+          ;;
+      esac
+
+      if $is_boundary; then
+        # Strip surrounding double quotes if present (from upstream
+        # single-to-double conversion); without this, wrapping below
+        # would produce '"expr"' with literal double-quote characters.
+        if [[ "$marker_buf" == '"'*'"' ]]; then
+          marker_buf="${marker_buf#\"}"
+          marker_buf="${marker_buf%\"}"
+        fi
+        # Flush the collected marker expression
+        if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+          output+="'${marker_buf}' "
+        else
+          output+="${marker_buf} "
+        fi
+        collecting=false
+        marker_buf=""
+        # Check if this boundary word itself starts a new -m/-k
+        if [[ "$word" == "-m" || "$word" == "-k" ]]; then
+          output+="${word} "
+          collecting=true
+        # Drop stray backslash tokens silently
+        elif [[ "$word" == "\\" ]]; then
+          :
+        else
+          output+="${word} "
+        fi
+      else
+        # Accumulate into marker buffer
+        if [[ -n "$marker_buf" ]]; then
+          marker_buf+=" ${word}"
+        else
+          marker_buf="${word}"
+        fi
+      fi
+    elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
+      output+="${word} "
+      collecting=true
+      marker_buf=""
+    else
+      output+="${word} "
+    fi
+  done
+
+  # Flush any trailing marker expression (marker at end of command)
+  if $collecting && [[ -n "$marker_buf" ]]; then
+    # Strip surrounding double quotes (see mid-stream flush comment)
+    if [[ "$marker_buf" == '"'*'"' ]]; then
+      marker_buf="${marker_buf#\"}"
+      marker_buf="${marker_buf%\"}"
+    fi
+    if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+      output+="'${marker_buf}'"
+    else
+      output+="${marker_buf}"
+    fi
+  fi
+
+  echo "${output% }"
+}
+
+###############################################################################
+# ROCm-specific pytest command rewrites
+#
+# These apply ignore flags and environment overrides for tests that are not
+# yet supported or behave differently on ROCm hardware. Kept as a single
+# function so new exclusions are easy to add in one place.
+###############################################################################
+
+apply_rocm_test_overrides() {  # $1: command string; prints the rewritten command on stdout
+  local cmds="$1"
+
+  # --- Model registry filter ---
+  if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
+    cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
+  fi
+
+  # --- LoRA: disable custom paged attention ---
+  if [[ $cmds == *"pytest -v -s lora"* ]]; then
+    cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+  fi
+
+  # --- Kernel ignores ---
+  if [[ $cmds == *" kernels/core"* ]]; then
+    cmds="${cmds} \
+      --ignore=kernels/core/test_fused_quant_layernorm.py \
+      --ignore=kernels/core/test_permute_cols.py"
+  fi
+
+  if [[ $cmds == *" kernels/attention"* ]]; then
+    cmds="${cmds} \
+      --ignore=kernels/attention/test_attention_selector.py \
+      --ignore=kernels/attention/test_encoder_decoder_attn.py \
+      --ignore=kernels/attention/test_flash_attn.py \
+      --ignore=kernels/attention/test_flashinfer.py \
+      --ignore=kernels/attention/test_prefix_prefill.py \
+      --ignore=kernels/attention/test_cascade_flash_attn.py \
+      --ignore=kernels/attention/test_mha_attn.py \
+      --ignore=kernels/attention/test_lightning_attn.py \
+      --ignore=kernels/attention/test_attention.py"
+  fi
+
+  if [[ $cmds == *" kernels/quantization"* ]]; then
+    cmds="${cmds} \
+      --ignore=kernels/quantization/test_int8_quant.py \
+      --ignore=kernels/quantization/test_machete_mm.py \
+      --ignore=kernels/quantization/test_block_fp8.py \
+      --ignore=kernels/quantization/test_block_int8.py \
+      --ignore=kernels/quantization/test_marlin_gemm.py \
+      --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
+      --ignore=kernels/quantization/test_int8_kernel.py"
+  fi
+
+  if [[ $cmds == *" kernels/mamba"* ]]; then
+    cmds="${cmds} \
+      --ignore=kernels/mamba/test_mamba_mixer2.py \
+      --ignore=kernels/mamba/test_causal_conv1d.py \
+      --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
+  fi
+
+  if [[ $cmds == *" kernels/moe"* ]]; then
+    cmds="${cmds} \
+      --ignore=kernels/moe/test_moe.py \
+      --ignore=kernels/moe/test_cutlass_moe.py \
+      --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+  fi
+
+  # --- Entrypoint ignores ---
+  if [[ $cmds == *" entrypoints/openai "* ]]; then
+    cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
+      --ignore=entrypoints/openai/test_audio.py \
+      --ignore=entrypoints/openai/test_shutdown.py \
+      --ignore=entrypoints/openai/test_completion.py \
+      --ignore=entrypoints/openai/test_models.py \
+      --ignore=entrypoints/openai/test_lora_adapters.py \
+      --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
+      --ignore=entrypoints/openai/test_root_path.py \
+      --ignore=entrypoints/openai/test_tokenization.py \
+      --ignore=entrypoints/openai/test_prompt_validation.py "}
+  fi
+
+  if [[ $cmds == *" entrypoints/llm "* ]]; then
+    cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
+      --ignore=entrypoints/llm/test_chat.py \
+      --ignore=entrypoints/llm/test_accuracy.py \
+      --ignore=entrypoints/llm/test_init.py \
+      --ignore=entrypoints/llm/test_prompt_validation.py "}
+  fi
+
+  # Clean up escaped newlines from --ignore appends
+  cmds=$(echo "$cmds" | sed 's/ \\ / /g')
+
+  echo "$cmds"
+}
+
+###############################################################################
+# Main
+###############################################################################
+
+# --- GPU initialization ---
+echo "--- Confirming Clean Initial State"
+wait_for_clean_gpus
+
+echo "--- ROCm info"
+rocminfo
+
+# --- Docker
housekeeping ---
 cleanup_docker
 
 echo "--- Resetting GPUs"
-
 echo "reset" > /opt/amdgpu/etc/gpu_state
+wait_for_clean_gpus
 
-while true; do
-  sleep 3
-  if grep -q clean /opt/amdgpu/etc/gpu_state; then
-    echo "GPUs state is \"clean\""
-    break
-  fi
-done
-
+# --- Pull test image ---
 echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
 
 remove_docker_container() {
-   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT
 
+# --- Prepare commands ---
 echo "--- Running container"
 
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 
-commands=$@
-echo "Raw commands: $commands"
-
-commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
-
-if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
-  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
-fi
-
-commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
-
-if [[ $commands == *"pytest -v -s lora"* ]]; then
-  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-fi
-
-#ignore certain kernels tests
-if [[ $commands == *" kernels/core"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/core/test_fused_quant_layernorm.py \
-  --ignore=kernels/core/test_permute_cols.py"
-fi
-
-if [[ $commands == *" kernels/attention"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/attention/test_attention_selector.py \
-  --ignore=kernels/attention/test_encoder_decoder_attn.py \
-  --ignore=kernels/attention/test_flash_attn.py \
-  --ignore=kernels/attention/test_flashinfer.py \
-  --ignore=kernels/attention/test_prefix_prefill.py \
-  --ignore=kernels/attention/test_cascade_flash_attn.py \
-  --ignore=kernels/attention/test_mha_attn.py \
-  --ignore=kernels/attention/test_lightning_attn.py \
-  --ignore=kernels/attention/test_attention.py"
-fi
-
-if [[ $commands == *" kernels/quantization"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/quantization/test_int8_quant.py \
-  --ignore=kernels/quantization/test_machete_mm.py \
-  --ignore=kernels/quantization/test_block_fp8.py \
-  --ignore=kernels/quantization/test_block_int8.py \
-  --ignore=kernels/quantization/test_marlin_gemm.py \
-  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
-  --ignore=kernels/quantization/test_int8_kernel.py"
-fi
-
-if [[ $commands == *" kernels/mamba"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/mamba/test_mamba_mixer2.py \
-  --ignore=kernels/mamba/test_causal_conv1d.py \
-  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
-fi
-
-if [[ $commands == *" kernels/moe"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/moe/test_moe.py \
-  --ignore=kernels/moe/test_cutlass_moe.py \
-  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+# ---- Command source selection ----
+# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
+# Fall back to $* for backward compatibility, but warn that inner
+# double-quotes will have been stripped by the calling shell.
+if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
+  commands="${VLLM_TEST_COMMANDS}"
+  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
+else
+  commands="$*"
+  if [[ -z "$commands" ]]; then
+    echo "Error: No test commands provided." >&2
+    echo "Usage:" >&2
+    echo " Preferred: VLLM_TEST_COMMANDS='...' bash $0" >&2
+    echo " Legacy: bash $0 \"commands here\"" >&2
+    exit 1
+  fi
+  echo "Commands sourced from positional args (legacy mode)"
+  echo "WARNING: Inner double-quotes in the command string may have been"
+  echo " stripped by the calling shell. If you see syntax errors, switch to:"
+  echo " export VLLM_TEST_COMMANDS='your commands here'"
+  echo " bash $0"
 fi
 
-#ignore certain Entrypoints/openai tests
-if [[ $commands == *" entrypoints/openai "* ]]; then
-  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_shutdown.py \
-  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_lora_adapters.py \
-  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-  --ignore=entrypoints/openai/test_root_path.py \
-  --ignore=entrypoints/openai/test_tokenization.py \
-  --ignore=entrypoints/openai/test_prompt_validation.py "}
-fi
+echo "Raw commands: $commands"
 
-#ignore certain Entrypoints/llm tests
-if [[ $commands == *" entrypoints/llm "* ]]; then
-  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
-  --ignore=entrypoints/llm/test_chat.py \
-  --ignore=entrypoints/llm/test_accuracy.py \
-  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_prompt_validation.py "}
-fi
+# Fix quoting before ROCm overrides (so overrides see correct structure)
+commands=$(re_quote_pytest_markers "$commands")
+echo "After re-quoting: $commands"
 
-commands=$(echo "$commands" | sed 's/ \\ / /g')
+commands=$(apply_rocm_test_overrides "$commands")
 echo "Final commands: $commands"
 
-# --ignore=entrypoints/openai/test_encoder_decoder.py \
-# --ignore=entrypoints/openai/test_embedding.py \
-# --ignore=entrypoints/openai/test_oot_registration.py
-# --ignore=entrypoints/openai/test_accuracy.py \
-# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
-
-
 MYPYTHONPATH=".."
 
-# Test that we're launching on the machine that has
-# proper access to GPUs
+# Verify GPU access
 render_gid=$(getent group render | cut -d: -f3)
 if [[ -z "$render_gid" ]]; then
   echo "Error: 'render' group not found. This is required for GPU access." >&2
   exit 1
 fi
 
-if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
+# --- RDMA device passthrough (conditional) ---
+# If the host has RDMA devices, pass them through so tests like
+# test_moriio_connector can access ibverbs. On hosts without RDMA
+# hardware the tests will gracefully skip via _rdma_available().
+RDMA_FLAGS=()  # array, so it expands to zero docker arguments when no RDMA device exists
+if [ -d /dev/infiniband ]; then
+  echo "RDMA devices detected on host, enabling passthrough"
+  RDMA_FLAGS=(--device /dev/infiniband --cap-add=IPC_LOCK)
+else
+  echo "No RDMA devices found on host, RDMA tests will be skipped"
+fi
 
+# --- Route: multi-node vs single-node ---
+if is_multi_node "$commands"; then
+  echo "--- Multi-node job detected"
   export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')
 
-  if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
-    prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
-    echo "PREFIX: ${prefix}"
-    export composite_command="(command rocm-smi || true)"
-    myIFS=$IFS
-    IFS=','
-    read -ra node0 <<< ${BASH_REMATCH[2]}
-    read -ra node1 <<< ${BASH_REMATCH[3]}
-    IFS=$myIFS
-    for i in "${!node0[@]}";do
-      command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
-      command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
-
-      export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
-      echo "COMMANDS: ${commands}"
-      composite_command=$(echo "${composite_command} && ${commands}")
-    done
-    /bin/bash -c "${composite_command}"
-    cleanup_network
+  # Parse the bracket syntax: prefix ; [node0_cmds] && [node1_cmds]
+  # BASH_REMATCH[1] = prefix (everything before first bracket)
+  # BASH_REMATCH[2] = comma-separated node0 commands
+  # BASH_REMATCH[3] = comma-separated node1 commands
+  if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
+    prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
+    echo "PREFIX: ${prefix}"
+
+    export composite_command="(command rocm-smi || true)"
+    saved_IFS=$IFS
+    IFS=','
+    read -ra node0 <<< "${BASH_REMATCH[2]}"
+    read -ra node1 <<< "${BASH_REMATCH[3]}"
+    IFS=$saved_IFS
+
+    if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
+      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
+    fi
+
+    for i in "${!node0[@]}"; do
+      command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
+      command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
+
+      step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
+      echo "COMMANDS: ${step_cmd}"
+      composite_command="${composite_command} && ${step_cmd}"
+    done
+
+    /bin/bash -c "${composite_command}"
+    exit_code=$?
+    cleanup_network
+    handle_pytest_exit "$exit_code"
   else
-    echo "Failed to parse node commands! Exiting."
-    cleanup_network
-    exit 111
+    echo "Multi-node job detected but failed to parse bracket command syntax."
+    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
+    echo "Got: $commands"
+    cleanup_network
+    exit 111
   fi
 else
+  echo "--- Single-node job"
   echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
   docker run \
-          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-          --network=host \
-          --shm-size=16gb \
-          --group-add "$render_gid" \
-          --rm \
-          -e HF_TOKEN \
-          -e AWS_ACCESS_KEY_ID \
-          -e AWS_SECRET_ACCESS_KEY \
-          -v "${HF_CACHE}:${HF_MOUNT}" \
-          -e "HF_HOME=${HF_MOUNT}" \
-          -e "PYTHONPATH=${MYPYTHONPATH}" \
-          --name "${container_name}" \
-          "${image_name}" \
-          /bin/bash -c "${commands}"
+    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+    "${RDMA_FLAGS[@]}" \
+    --network=host \
+    --shm-size=16gb \
+    --group-add "$render_gid" \
+    --rm \
+    -e HF_TOKEN \
+    -e AWS_ACCESS_KEY_ID \
+    -e AWS_SECRET_ACCESS_KEY \
+    -e BUILDKITE_PARALLEL_JOB \
+    -e BUILDKITE_PARALLEL_JOB_COUNT \
+    -v "${HF_CACHE}:${HF_MOUNT}" \
+    -e "HF_HOME=${HF_MOUNT}" \
+    -e "PYTHONPATH=${MYPYTHONPATH}" \
+    --name "${container_name}" \
+    "${image_name}" \
+    /bin/bash -c "${commands}"
+
+  exit_code=$?
+  handle_pytest_exit "$exit_code"
 fi
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..232673f01a0b716371ba74d1ef74eca9675effbf
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+set -euox pipefail
+
+export VLLM_CPU_KVCACHE_SPACE=1
+export VLLM_CPU_CI_ENV=1
+# Reduce sub-processes for acceleration
+export TORCH_COMPILE_DISABLE=1
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
+SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
+wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
+echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check
+mkdir -p sde
+tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
+
+# Wait for a background job and report how it ended.
+#   $1 - PID of the background job to wait for
+#   $2 - log file the job wrote to; its last 50 lines are printed on failure
+# Returns 0 when the job exited with status 0, 1 otherwise (or on missing args).
+wait_for_pid_and_check_log() {
+    # ${N:-} keeps 'set -u' from aborting before the usage check can run.
+    local pid="${1:-}"
+    local log_file="${2:-}"
+    local exit_status=0
+
+    if [ -z "$pid" ] || [ -z "$log_file" ]; then
+        echo "Usage: wait_for_pid_and_check_log PID LOG_FILE" >&2
+        return 1
+    fi
+
+    echo "Waiting for process $pid to finish..."
+
+    # 'wait' returns the exit status of the waited-for process; capture it via '||'.
+    wait "$pid" || exit_status=$?
+
+    if [ "$exit_status" -ne 0 ]; then
+        echo "Process $pid finished with exit status $exit_status (Failure)."
+        echo "--- Last few lines of log file: $log_file ---"
+        tail -n 50 "$log_file" || true
+        echo "---------------------------------------------"
+        return 1
+    fi
+
+    echo "Process $pid finished with exit status 0 (Success)."
+    return 0
+}
+
+# Test Sky Lake (AVX512F)
+./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
+PID_TEST_0=$!
+
+# Test Cascade Lake (AVX512F + VNNI)
+./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
+PID_TEST_1=$!
+
+# Test Cooper Lake (AVX512F + VNNI + BF16)
+./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
+PID_TEST_2=$!
+
+# Check every run before failing: under 'set -e' a bare failing call would
+# abort the script and leave the remaining runs unreaped and unreported.
+failed=0
+wait_for_pid_and_check_log "$PID_TEST_0" test_0.log || failed=1
+wait_for_pid_and_check_log "$PID_TEST_1" test_1.log || failed=1
+wait_for_pid_and_check_log "$PID_TEST_2" test_2.log || failed=1
+exit "$failed"
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
index 3caa49832c3f479b0b5ff071bba8c803c01c1c47..f289a43c6be4eca53d43a8c31b5936ebfafcf536 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -1,26 +1,43 @@
 #!/bin/bash
 set -euox pipefail
+export VLLM_CPU_CI_ENV=0
 
 echo "--- PP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
   --backend vllm \
   --dataset-name random \
   --model meta-llama/Llama-3.2-3B-Instruct \
   --num-prompts 20 \
+  --result-dir ./test_results \
+  --result-filename tp_pp.json \
+  --save-result \
   --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+failed_req=$(jq -r '.failed' ./test_results/tp_pp.json)
+if [ "$failed_req" != "0" ]; then  # string compare: "null" or an empty value must fail, not error out of the test
+  echo "PP+TP benchmark reported failed requests (failed=${failed_req})"
+  exit 1
+fi
 
 echo "--- DP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
   --backend vllm \
   --dataset-name random \
   --model meta-llama/Llama-3.2-3B-Instruct \
   --num-prompts 20 \
+  --result-dir ./test_results \
+  --result-filename dp_tp.json \
+  --save-result \
   --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+failed_req=$(jq -r '.failed' ./test_results/dp_tp.json)
+if [ "$failed_req" != "0" ]; then  # string compare: "null" or an empty value must fail, not error out of the test
+  echo "DP+TP benchmark reported failed requests (failed=${failed_req})"
+  exit 1
+fi
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
index b6274d698d01ae314048026a8803ba8f2bdfd7ca..528385d505ff4ef4259868cf1aab6ca1028701ed 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -34,7 +34,7 @@ function cpu_tests() {
   # offline inference
   docker exec cpu-test bash -c "
     set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"
 
   # Run model tests
   docker exec cpu-test bash -c "
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 3728f73fa2a362e4592240cb31055d0396f0e172..e82baed0517bd19940a641938da583dda7b52ecc 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
   podman exec -it "$container_id" bash -c "
     export TORCH_COMPILE_DISABLE=1
     set -xve
-    python3
examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log # Run basic model test podman exec -it "$container_id" bash -c " @@ -43,7 +43,7 @@ function cpu_tests() { pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. - # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log + # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log } # All of CPU tests are expected to be finished less than 40 mins. diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index c32b051cabc18552940f63e9674ec413ce076752..db75ad3083b2402e21280f1516dccc136c1e1652 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image" docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. 
-docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \ - timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}" +docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \ + timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}" diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh index f69e4b06680f57da5ab5c5885c581c7c2cbe3d95..06e0f7af87cad262171a92af6e2a2e593007c506 100644 --- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh +++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh @@ -25,5 +25,5 @@ remove_docker_container # Run the image and test offline inference docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B + python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B ' diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh index 7df696eb29fcb84bcf642ca6ee76d25424e6df66..10df07b2000f5168e55e4a57c5a1cbe837263ce6 100644 --- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh @@ -1,17 +1,42 @@ #!/bin/bash -# This script build the CPU docker image and run the offline inference inside the container. 
+# This script builds the HPU docker image and runs the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
+#
+# vllm-gaudi compatibility pinning:
+# The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
+# When upstream vllm changes its API, the plugin may break before it has been updated.
+# To handle this, the vllm-gaudi repository maintains a file:
+#   vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
+# The first line of that file controls what version of vllm is used inside the Docker image:
+#   - "latest"       : no checkout override; the current Buildkite CI commit is used as-is.
+#   - any commit SHA : vllm is checked out to that specific commit before building, pinning
+#                      the test to a known-compatible baseline.
+# To unpin (resume testing against the live vllm tip), set the file content back to "latest".
 set -exuo pipefail
 
+# Fetch the vllm community commit reference from vllm-gaudi (first line only); --fail stops an HTTP error body being used as the ref.
+VLLM_COMMUNITY_COMMIT=$(curl -fsSL --retry 3 \
+  https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
+  | head -1 | tr -d '\n')
+
+echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT:?empty VLLM_COMMUNITY_COMMIT fetched from vllm-gaudi}"
+
 # Try building the docker image
 image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
 container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
-cat </dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
+    fi
+
 WORKDIR /workspace/vllm
 
 ENV no_proxy=localhost,127.0.0.1
@@ -39,19 +64,19 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
EXITCODE=1 -remove_docker_containers() { docker rm -f ${container_name} || true; } +remove_docker_containers() { docker rm -f "${container_name}" || true; } trap 'remove_docker_containers; exit $EXITCODE;' EXIT remove_docker_containers echo "Running HPU plugin v1 test" -docker run --rm --runtime=habana --name=${container_name} --network=host \ +docker run --rm --runtime=habana --name="${container_name}" --network=host \ -e HABANA_VISIBLE_DEVICES=all \ -e VLLM_SKIP_WARMUP=true \ -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ -e PT_HPU_LAZY_MODE=1 \ "${image_name}" \ /bin/bash -c ' - cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m + cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m ' EXITCODE=$? diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh index 0db1abe37ba11f3118f76968f956fe5f76e1a089..9d33a8c0b2270c56a74ed6f1ebde4f7ba1351345 100644 --- a/.buildkite/scripts/hardware_ci/run-npu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh @@ -41,6 +41,7 @@ get_config() { echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 exit 1 fi + # shellcheck source=/dev/null source "${TEST_RUN_CONFIG_FILE}" echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" return 0 @@ -48,9 +49,8 @@ get_config() { # get test running configuration. fetch_vllm_test_cfg -get_config # Check if the function call was successful. If not, exit the script. -if [ $? -ne 0 ]; then +if ! 
get_config; then exit 1 fi @@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}') echo "agent_idx: ${agent_idx}" builder_name="cachebuilder${agent_idx}" builder_cache_dir="/mnt/docker-cache${agent_idx}" -mkdir -p ${builder_cache_dir} +mkdir -p "${builder_cache_dir}" # Try building the docker image cat <=0.4.9.2" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index eafc82b98439be027a28b4be8b9fc4899badbf5e..feaf2b3562675005446c377f002dcedea828161b 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR" echo "--- Installing Python dependencies ---" python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ - && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \ + && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index b52dd7826e5444c220196aa4af753a597313bd5e..be7886354392b192e397026fb63d760b714a0993 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" # Try building the docker image -docker build -t ${image_name} -f docker/Dockerfile.xpu . 
+docker build -t "${image_name}" -f docker/Dockerfile.xpu . # Setup cleanup remove_docker_container() { @@ -34,17 +34,17 @@ docker run \ set -e echo $ZE_AFFINITY_MASK pip install tblib==3.1.0 - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 - python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager - python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 - python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + python3 
examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN + python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 + python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager + python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 + python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel cd tests - pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py + pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py pytest -v -s v1/engine pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py diff --git a/.buildkite/scripts/push-nightly-builds.sh b/.buildkite/scripts/push-nightly-builds.sh index 98e80fd99ec4841810f3332db475de6c451b6a70..20c372a950dfd92b1d684c68468f4ce5d7d71cbf 100755 --- a/.buildkite/scripts/push-nightly-builds.sh +++ b/.buildkite/scripts/push-nightly-builds.sh @@ -21,16 +21,16 @@ echo "Pushing original tag $ORIG_TAG_NAME$ORIG_TAG_SUFFIX to new nightly tag nam # pull original arch-dependent images from AWS ECR Public aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" +docker pull 
public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" # tag arch-dependent images -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-x86_64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-x86_64 -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$ORIG_TAG_NAME-aarch64$ORIG_TAG_SUFFIX vllm/vllm-openai:$TAG_NAME-aarch64 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-x86_64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-x86_64 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG_NAME"-aarch64"$ORIG_TAG_SUFFIX" vllm/vllm-openai:"$TAG_NAME"-aarch64 # push arch-dependent images to DockerHub -docker push vllm/vllm-openai:$TAG_NAME-x86_64 -docker push vllm/vllm-openai:$TAG_NAME-aarch64 +docker push vllm/vllm-openai:"$TAG_NAME"-x86_64 +docker push vllm/vllm-openai:"$TAG_NAME"-aarch64 # push arch-independent manifest to DockerHub -docker manifest create vllm/vllm-openai:$TAG_NAME vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend -docker manifest create vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT vllm/vllm-openai:$TAG_NAME-x86_64 vllm/vllm-openai:$TAG_NAME-aarch64 --amend -docker manifest push vllm/vllm-openai:$TAG_NAME -docker manifest push vllm/vllm-openai:$TAG_NAME-$BUILDKITE_COMMIT +docker manifest create vllm/vllm-openai:"$TAG_NAME" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend +docker manifest create vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" vllm/vllm-openai:"$TAG_NAME"-x86_64 vllm/vllm-openai:"$TAG_NAME"-aarch64 --amend +docker manifest push vllm/vllm-openai:"$TAG_NAME" +docker manifest push vllm/vllm-openai:"$TAG_NAME"-"$BUILDKITE_COMMIT" diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh deleted file mode 100755 index 3fb7c82c8d333ee715b74e87ff66d2a20fa3efd8..0000000000000000000000000000000000000000 --- a/.buildkite/scripts/run-prime-rl-test.sh +++ /dev/null @@ -1,64 +0,0 
@@ -#!/bin/bash -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Setup script for Prime-RL integration tests -# This script prepares the environment for running Prime-RL tests with nightly vLLM - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" -PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" -PRIME_RL_DIR="${REPO_ROOT}/prime-rl" - -if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then - echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..." - exit 0 -fi - -echo "Setting up Prime-RL integration test environment..." - -# Clean up any existing Prime-RL directory -if [ -d "${PRIME_RL_DIR}" ]; then - echo "Removing existing Prime-RL directory..." - rm -rf "${PRIME_RL_DIR}" -fi - -# Install UV if not available -if ! command -v uv &> /dev/null; then - echo "Installing UV package manager..." - curl -LsSf https://astral.sh/uv/install.sh | sh - source $HOME/.local/bin/env -fi - -# Clone Prime-RL repository at specific branch for reproducible tests -PRIME_RL_BRANCH="integ-vllm-main" -echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..." -git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}" -cd "${PRIME_RL_DIR}" - -echo "Setting up UV project environment..." -export UV_PROJECT_ENVIRONMENT=/usr/local -ln -s /usr/bin/python3 /usr/local/bin/python - -# Remove vllm pin from pyproject.toml -echo "Removing vllm pin from pyproject.toml..." -sed -i '/vllm==/d' pyproject.toml - -# Sync Prime-RL dependencies -echo "Installing Prime-RL dependencies..." -uv sync --inexact && uv sync --inexact --all-extras - -# Verify installation -echo "Verifying installations..." 
-uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" -uv run python -c "import prime_rl; print('Prime-RL imported successfully')" - -echo "Prime-RL integration test environment setup complete!" - -echo "Running Prime-RL integration tests..." -export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY -uv run pytest -vs tests/integration/test_rl.py -m gpu - -echo "Prime-RL integration tests completed!" diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 463969cbc2acdbd8f97950c72b186b1994ae9eb0..e26273bba39a454fc955f903ef14bc650aa3fe27 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -51,14 +51,14 @@ for BACK in "${BACKENDS[@]}"; do --enable-eplb \ --trust-remote-code \ --max-model-len 2048 \ - --all2all-backend $BACK \ - --port $PORT & + --all2all-backend "$BACK" \ + --port "$PORT" & SERVER_PID=$! 
- wait_for_server $PORT + wait_for_server "$PORT" TAG=$(echo "$MODEL" | tr '/: \\n' '_____') OUT="${OUT_DIR}/${TAG}_${BACK}.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 - < /dev/null; do + sleep 1 + done' +} + +MODEL="deepseek-ai/DeepSeek-V2-Lite" + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then + kill "${SERVER_PID}" 2>/dev/null || true + for _ in {1..20}; do + kill -0 "${SERVER_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -9 "${SERVER_PID}" 2>/dev/null || true + fi +} +trap cleanup EXIT + +vllm serve "$MODEL" \ + --max-model-len 2048 \ + --offload-group-size 8 \ + --offload-num-in-group 2 \ + --offload-prefetch-step 1 \ + --offload-params w13_weight w2_weight \ + --port "$PORT" & +SERVER_PID=$! +wait_for_server "$PORT" + +TAG=$(echo "$MODEL" | tr '/: \\n' '_____') +OUT="${OUT_DIR}/${TAG}_prefetch_offload.json" +python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" +python3 - <= ${THRESHOLD}, f"${MODEL} prefetch_offload accuracy {acc}" +PY + +cleanup +SERVER_PID= diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh index d0921c5699d5d202bd0fed73e3ac0bb14860d4f0..729a0fb7f6882b146d0b9189b965df4663132c31 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh @@ -47,20 +47,20 @@ for BACK in "${BACKENDS[@]}"; do vllm serve "$MODEL" \ --enforce-eager \ --enable-eplb \ - --all2all-backend $BACK \ + --all2all-backend "$BACK" \ --eplb-config '{"window_size":10, "step_interval":100, 
"num_redundant_experts":0, "log_balancedness":true}' \ - --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ - --data-parallel-size ${DATA_PARALLEL_SIZE} \ + --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \ + --data-parallel-size "${DATA_PARALLEL_SIZE}" \ --enable-expert-parallel \ --trust-remote-code \ --max-model-len 2048 \ - --port $PORT & + --port "$PORT" & SERVER_PID=$! - wait_for_server $PORT + wait_for_server "$PORT" TAG=$(echo "$MODEL" | tr '/: \\n' '_____') OUT="${OUT_DIR}/${TAG}_${BACK}.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 - < /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH: BACKENDS=("allgather_reducescatter") # Disable MOE padding for ROCm since it is causing eplb to fail export VLLM_ROCM_MOE_PADDING=0 - PLATFORM_ARGS=("--no-async-scheduling") + PLATFORM_ARGS=("--no-async-scheduling" "--attention-backend=TRITON_ATTN") echo "Disabled async scheduling for ROCm platform due to issues with spec decode." else # Non-ROCm platform (CUDA/other) @@ -51,20 +51,20 @@ for BACK in "${BACKENDS[@]}"; do --tensor-parallel-size 4 \ --enable-expert-parallel \ --enable-eplb \ - --all2all-backend $BACK \ + --all2all-backend "$BACK" \ --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --trust-remote-code \ --max-model-len 2048 \ --gpu-memory-utilization 0.9 \ "${PLATFORM_ARGS[@]}" \ - --port $PORT & + --port "$PORT" & SERVER_PID=$! 
- wait_for_server $PORT + wait_for_server "$PORT" TAG=$(echo "$MODEL" | tr '/: \\n' '_____') OUT="${OUT_DIR}/${TAG}_${BACK}.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}" python3 - <}" +echo "Test category: $TEST_CATEGORY" +echo "TP size: $TP_SIZE" +echo "Max model len: $MAX_MODEL_LEN" +echo "Port: $PORT" +echo "Num threads: $NUM_THREADS" +echo "============================================" + +# ---- Install bfcl-eval if missing ---- +if ! python3 -c "import bfcl_eval" 2>/dev/null; then + echo "Installing bfcl-eval..." + pip install "bfcl-eval>=2025.10.20.1,<2026" +fi + +# ---- Cleanup handler ---- +SERVER_PID="" +cleanup() { + if [ -n "$SERVER_PID" ]; then + echo "Stopping vLLM server (pid=$SERVER_PID)..." + kill "$SERVER_PID" 2>/dev/null || true + wait "$SERVER_PID" 2>/dev/null || true + fi + # Remove BFCL lock files (created by filelock for thread-safe writes) + rm -rf .file_locks/ + if [ -n "${OUTPUT_DIR:-}" ]; then + rm -rf "$OUTPUT_DIR/.file_locks/" + fi +} +trap cleanup EXIT + +# ---- Start vLLM server ---- +echo "Starting vLLM server..." + +SERVE_ARGS=( + "$MODEL" + --port "$PORT" + --enable-auto-tool-choice + --tool-call-parser "$TOOL_CALL_PARSER" + --tensor-parallel-size "$TP_SIZE" + --max-model-len "$MAX_MODEL_LEN" + --enforce-eager + --no-enable-prefix-caching +) + +# Append reasoning parser if specified +if [ -n "$REASONING_PARSER" ]; then + SERVE_ARGS+=(--reasoning-parser "$REASONING_PARSER") +fi + +# Append any extra args +if [ -n "$EXTRA_ARGS" ]; then + read -ra EXTRA_ARGS_ARRAY <<< "$EXTRA_ARGS" + SERVE_ARGS+=("${EXTRA_ARGS_ARRAY[@]}") +fi + +echo "Command: vllm serve ${SERVE_ARGS[*]}" +vllm serve "${SERVE_ARGS[@]}" & +SERVER_PID=$! + +# ---- Wait for server to be ready ---- +echo "Waiting for vLLM server to start (timeout: 600s)..." 
+SECONDS_WAITED=0 +until curl -sf "http://localhost:${PORT}/health" > /dev/null 2>&1; do + if [ $SECONDS_WAITED -ge 600 ]; then + echo "" + echo "ERROR: vLLM server failed to start within 600s" + exit 1 + fi + if (( SECONDS_WAITED % 30 == 0 && SECONDS_WAITED > 0 )); then + echo " Still waiting... (${SECONDS_WAITED}s elapsed)" + fi + sleep 2 + SECONDS_WAITED=$((SECONDS_WAITED + 2)) +done +echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)" + +# ---- Run BFCL evaluation ---- +# bfcl-eval has no CLI entry point; generate() and evaluate() are Typer +# functions that must be called from Python. The MODEL_CONFIG_MAPPING must +# be patched in-process so BFCL knows to use the OpenAI-compatible handler +# against our local vLLM server. +bfcl_exit_code=0 +python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$? +import os +import sys + +model = sys.argv[1] +test_category = sys.argv[2] +num_threads = int(sys.argv[3]) +port = sys.argv[4] +api_type = sys.argv[5] +output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd() + +os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1" +os.environ["OPENAI_API_KEY"] = "dummy" +os.environ["BFCL_PROJECT_ROOT"] = output_dir + +import bfcl_eval.constants.model_config as bfcl_model_config +from bfcl_eval.constants.model_config import ModelConfig +from bfcl_eval.model_handler.api_inference.openai_completion import ( + OpenAICompletionsHandler, +) +from bfcl_eval.model_handler.api_inference.openai_response import ( + OpenAIResponsesHandler, +) + +if api_type == "responses": + handler = OpenAIResponsesHandler +else: + handler = OpenAICompletionsHandler + +bfcl_model_config.MODEL_CONFIG_MAPPING[model] = ModelConfig( + model_name=model, + display_name=f"{model} (FC) (vLLM)", + url=f"https://huggingface.co/{model}", + org="", + license="apache-2.0", + model_handler=handler, + input_price=None, + output_price=None, + is_fc_model=True, + 
underscore_to_dot=True, +) + +from bfcl_eval.__main__ import evaluate, generate +import inspect +import typer + + +def _get_default_kwargs(function): + kwargs = {} + for k, v in inspect.signature(function).parameters.items(): + if v.default is not inspect.Parameter.empty: + default = v.default + if isinstance(default, typer.models.OptionInfo): + default = default.default + kwargs[k] = default + return kwargs + + +# ---- generate ---- +print(f"=== BFCL generate: model={model} test_category={test_category} ===") +gen_kwargs = _get_default_kwargs(generate) +gen_kwargs["model"] = [model] +gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")] +gen_kwargs["skip_server_setup"] = True +gen_kwargs["num_threads"] = num_threads +generate(**gen_kwargs) + +# ---- evaluate ---- +print(f"=== BFCL evaluate: model={model} test_category={test_category} ===") +eval_kwargs = _get_default_kwargs(evaluate) +eval_kwargs["model"] = [model] +eval_kwargs["test_category"] = [c.strip() for c in test_category.split(",")] +evaluate(**eval_kwargs) + +print("=== BFCL evaluation completed successfully ===") +PYEOF + +# ---- Upload results to buildkite ---- +if command -v buildkite-agent &>/dev/null; then + if [ $bfcl_exit_code -eq 0 ]; then + STYLE="success" + STATUS="PASSED" + else + STYLE="error" + STATUS="FAILED" + fi + + buildkite-agent annotate --style "$STYLE" --context "bfcl-results" < "$VLLM_LOG" 2>&1 & + --download_dir "$DOWNLOAD_DIR" \ + --max-model-len "$MAX_MODEL_LEN" > "$VLLM_LOG" 2>&1 & echo "wait for 20 minutes.." echo # sleep 1200 # wait for 10 minutes... -for i in {1..120}; do +for _ in {1..120}; do # TODO: detect other type of errors. if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then echo "Detected RuntimeError, exiting." 
@@ -78,11 +78,11 @@ echo "logging to $BM_LOG" echo vllm bench serve \ --backend vllm \ - --model $MODEL \ + --model "$MODEL" \ --dataset-name sonnet \ --dataset-path benchmarks/sonnet_4x.txt \ - --sonnet-input-len $INPUT_LEN \ - --sonnet-output-len $OUTPUT_LEN \ + --sonnet-input-len "$INPUT_LEN" \ + --sonnet-output-len "$OUTPUT_LEN" \ --ignore-eos > "$BM_LOG" echo "completed..." diff --git a/.buildkite/scripts/upload-nightly-wheels.sh b/.buildkite/scripts/upload-nightly-wheels.sh index 1af7f476ae74b725aa3c969256a49d5ebca0b411..071939df9ca6348c24c1a929bd0a2acf97a560dc 100644 --- a/.buildkite/scripts/upload-nightly-wheels.sh +++ b/.buildkite/scripts/upload-nightly-wheels.sh @@ -72,20 +72,19 @@ obj_json="objects.json" aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" mkdir -p "$INDICES_OUTPUT_DIR" -# call script to generate indicies for all existing wheels +# call script to generate indices for all existing wheels # this indices have relative paths that could work as long as it is next to the wheel directory in s3 # i.e., the wheels are always in s3://vllm-wheels// # and indices can be placed in //, or /nightly/, or // -if [[ ! 
-z "$DEFAULT_VARIANT_ALIAS" ]]; then - alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS" -else - alias_arg="" +alias_args=() +if [[ -n "$DEFAULT_VARIANT_ALIAS" ]]; then + alias_args=(--alias-to-default "$DEFAULT_VARIANT_ALIAS") fi # HACK: we do not need regex module here, but it is required by pre-commit hook # To avoid any external dependency, we simply replace it back to the stdlib re module sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py -$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg +$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" "${alias_args[@]}" # copy indices to // unconditionally echo "Uploading indices to $S3_COMMIT_PREFIX" @@ -100,9 +99,9 @@ fi # re-generate and copy to // only if it does not have "dev" in the version if [[ "$version" != *"dev"* ]]; then echo "Re-generating indices for /$pure_version/" - rm -rf "$INDICES_OUTPUT_DIR/*" + rm -rf "${INDICES_OUTPUT_DIR:?}/*" mkdir -p "$INDICES_OUTPUT_DIR" # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path - $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg + $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" "${alias_args[@]}" aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" fi diff --git a/.buildkite/scripts/upload-release-wheels-pypi.sh b/.buildkite/scripts/upload-release-wheels-pypi.sh index 
75f519168c5f0f3d69bf45027879406ff7a80aa8..058e5bbe4f4c533c13aed56b58e96e0689fdca1e 100644 --- a/.buildkite/scripts/upload-release-wheels-pypi.sh +++ b/.buildkite/scripts/upload-release-wheels-pypi.sh @@ -7,7 +7,7 @@ SUBPATH=$BUILDKITE_COMMIT S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" RELEASE_VERSION=$(buildkite-agent meta-data get release-version) -GIT_VERSION=$(git describe --exact-match --tags $BUILDKITE_COMMIT 2>/dev/null) +GIT_VERSION=$(git describe --exact-match --tags "$BUILDKITE_COMMIT" 2>/dev/null) echo "Release version from Buildkite: $RELEASE_VERSION" @@ -54,10 +54,13 @@ mkdir -p $DIST_DIR # include only wheels for the release version, ignore all files with "dev" or "rc" in the name (without excluding 'aarch64') aws s3 cp --recursive --exclude "*" --include "vllm-${PURE_VERSION}*.whl" --exclude "*dev*" --exclude "*rc[0-9]*" "$S3_COMMIT_PREFIX" $DIST_DIR echo "Wheels copied to local directory" -# generate source tarball -git archive --format=tar.gz --output="$DIST_DIR/vllm-${PURE_VERSION}.tar.gz" $BUILDKITE_COMMIT +# generate source distribution using setup.py +python setup.py sdist --dist-dir=$DIST_DIR ls -la $DIST_DIR +SDIST_FILE=$(find $DIST_DIR -name "vllm*.tar.gz") +echo "Found sdist: $SDIST_FILE" + # upload wheels to PyPI (only default variant, i.e. 
files without '+' in the name) PYPI_WHEEL_FILES=$(find $DIST_DIR -name "vllm-${PURE_VERSION}*.whl" -not -name "*+*") if [[ -z "$PYPI_WHEEL_FILES" ]]; then @@ -65,6 +68,6 @@ if [[ -z "$PYPI_WHEEL_FILES" ]]; then exit 1 fi -python3 -m twine check $PYPI_WHEEL_FILES -python3 -m twine upload --non-interactive --verbose $PYPI_WHEEL_FILES -echo "Wheels uploaded to PyPI" +python3 -m twine check "$PYPI_WHEEL_FILES" "$SDIST_FILE" +python3 -m twine upload --non-interactive --verbose "$PYPI_WHEEL_FILES" "$SDIST_FILE" +echo "Wheels and source distribution uploaded to PyPI" diff --git a/.buildkite/scripts/upload-rocm-wheels.sh b/.buildkite/scripts/upload-rocm-wheels.sh index bb555bc842925c13bd05f7aa44ebb8f4dabbf194..a42848a16ffe64fdb9da1bccf10fed7ba51201bf 100755 --- a/.buildkite/scripts/upload-rocm-wheels.sh +++ b/.buildkite/scripts/upload-rocm-wheels.sh @@ -55,7 +55,7 @@ mkdir -p all-rocm-wheels cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true -WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l) +WHEEL_COUNT=$(find all-rocm-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) echo "Total wheels to upload: $WHEEL_COUNT" if [ "$WHEEL_COUNT" -eq 0 ]; then @@ -115,7 +115,7 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] | fi # Extract version from vLLM wheel and update version-specific index -VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1) +VLLM_WHEEL=$(find all-rocm-wheels -maxdepth 1 -name 'vllm*.whl' 2>/dev/null | head -1) if [ -n "$VLLM_WHEEL" ]; then VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) echo "Version in wheel: $VERSION" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e78cdd7f8333d4a2159f79511af906a74c9b6b9b..7f8020540ab19801e1d993b666af86f53c3a4de4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -33,15 +33,3107 @@ # Note that all 
steps execute in parallel. steps: + + +##################################################################################################################################### +# # +# MI250 test definitions ( currently the test set is completely mirrored // TBD which tests are to be routed there ultimately) # +# # +##################################################################################################################################### + +- label: Pytorch Nightly Dependency Override Check # 2min + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + +- label: Async Engine, Inputs, Utils, Worker Test # 10min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/detokenizer + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s detokenizer + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/test_pooling_params.py + - tests/multimodal + - tests/renderers + - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ + - tests/tool_parsers + - tests/transformers_utils + - tests/config + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s test_pooling_params.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s renderers + - pytest -v -s tokenizers_ + - pytest -v -s tool_parsers + - pytest -v -s transformers_utils + - pytest -v -s config + +- label: Python-only Installation 
Test # 10min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Basic Correctness Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + +- label: Entrypoints Unit Tests # 5min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + fast_check: true + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration Test (LLM) # 30min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest 
-v -s entrypoints/llm/test_generate.py + - pytest -v -s entrypoints/offline_mode + +- label: Entrypoints Integration Test (API Server 1) # 100min + timeout_in_minutes: 130 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/test_chat_utils.py + +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/rpc + - tests/entrypoints/instrumentator + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/instrumentator + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s tool_use + +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + fast_check: true + torch_nightly: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Entrypoints Integration Test (Responses API) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, 
amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai/responses + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai/responses + +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - examples/offline_inference/new_weight_syncing/ + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s 
v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + - pushd ../examples/offline_inference/new_weight_syncing + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py + - popd + +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_8 + optional: true + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: EPLB Algorithm Test # 5min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdtentative, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 10min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, 
amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +- label: Regression Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + +- label: Engine Test # 9min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + +- label: V1 Test e2e + engine # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test e2e (2 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v 
-s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" + +- label: V1 Test e2e (4 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Test attention (H100) # 10min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: Batch Invariance Tests (H100) # 10min + timeout_in_minutes: 25 
+ mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + +- label: V1 Test others (CPU) # 5 mins + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + no_gpu: true + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples Test # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + commands: + - pip install tensorizer + - python3 offline_inference/basic/chat.py + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 
others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + - pytest -v -s cuda/test_platform_no_cuda_init.py + +- label: Samplers Test # 56min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + +- label: LoRA Test %N # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + parallelism: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss_tp.py \ + 
--ignore=lora/test_qwen3moe_tp.py + +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Compilation Passes Unit Tests + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/compile/passes + commands: + - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + +- label: Cudagraph test # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/ + - 
tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + parallelism: 2 + source_file_dependencies: + - csrc/attention/ + - vllm/v1/attention + - vllm/model_executor/layers/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + parallelism: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + parallelism: 2 + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Kernels Helion Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + 
source_file_dependencies: + - vllm/utils/import_utils.py + - tests/kernels/helion/ + commands: + - pip install helion + - pytest -v -s kernels/helion/ + +- label: Model Executor Test # 23min + timeout_in_minutes: 35 + torch_nightly: true + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Benchmarks # 11min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ + +- label: Quantization Test # 70min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + - uv pip install --system torchao==0.14.1 + - uv pip install --system conch-triton-kernels + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + 
commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: OpenAI API correctness # 10min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + - tools/ + commands: + - bash ../tools/install_torchcodec_rocm.sh || exit 1 + - pytest -s entrypoints/openai/correctness/ + +- label: Basic Models Tests (Initialization) # 15min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N # 15min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + parallelism: 2 + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/transformers_utils/ + - tests/models/test_initialization.py + commands: + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + +- label: Basic Models Tests (Other) # 15min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_terratorch.py + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: 
mi250_1 + torch_nightly: true + no_gpu: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Language Models Tests (Standard) # 18min + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N # 27min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + torch_nightly: true + parallelism: 2 + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + - pip freeze | grep -E 'torch' + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + +- label: Language Models Tests (Hybrid) %N # 50min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + torch_nightly: true + parallelism: 2 + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + +- label: Language Models Test 
(Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test (CPU) # 15min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + no_gpu: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + - tests/models/registry.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore 
models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + - tests/models/registry.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 100 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py + - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model + - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Accuracy Eval (Small Models) # 5min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + +- label: Multi-Modal Models Test (Extended) 1 # 60min + timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models Test (Extended) 2 #60min + timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 # 75min + timeout_in_minutes: 150 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export 
MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization + +- label: Transformers Nightly Models Test # 60 min + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - python3 examples/offline_inference/basic/chat.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper + +- label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- 
label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdmultinode, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 2 + num_nodes: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed' + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed Tests (2 GPUs) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: 
mi250_2 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + - examples/offline_inference/new_weight_syncing/ + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Model Tests (2 GPUs) # 37min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - 
TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' + +- label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + num_gpus: 2 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # test bge_m3_sparse io_processor plugin + - pip install -e ./plugins/bge_m3_sparse_plugin + - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py + - pip uninstall bge_m3_sparse_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - 
pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py + - pytest -v -s models/test_oot_registration.py + - pytest -v -s plugins/lora_resolvers + +- label: Pipeline + Context Parallelism Test # 45min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + +- label: LoRA TP Test (Distributed) # 17 min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + +- label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt + +- label: NixlConnector PD accuracy tests 
(Distributed) # 30min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + num_gpus: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: Distributed Tests (A100) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v 
test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (H100) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4 + +- label: Distributed Tests (H200) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_2 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace/" + commands: + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + - pytest -v -s tests/distributed/test_context_parallel.py + - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + +- label: LM Eval Small Models (1 Card) # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_1 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: LM Eval Large Models (4 Card) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export 
VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: ROCm LM Eval Large Models (8 Card) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_8 + num_gpus: 8 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 + +- label: ROCm GPT-OSS Eval # 80min + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + agent_pool: mi250_1 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + optional: true + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: DeepSeek V2-Lite Accuracy # 70min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # 70min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + agent_pool: mi250_4 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + + +################################################### +# # +# MI325 test definitions # +# # 
+################################################### + + +##### fast check tests ##### + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies. Please check the error message and add the package to whitelist + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking + optional: true + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + +- label: Async Engine, Inputs, Utils, Worker Test # 10min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking + source_file_dependencies: + - vllm/ + - tests/detokenizer + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s detokenizer + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/test_pooling_params.py + - tests/multimodal + - tests/renderers + - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ + - tests/tool_parsers + - tests/transformers_utils + - tests/config + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s test_pooling_params.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s renderers + - pytest -v -s tokenizers_ + - pytest -v -s tool_parsers + - pytest -v -s transformers_utils + - pytest -v -s config + +- label: Python-only Installation Test # 
10min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Basic Correctness Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + +- label: Entrypoints Unit Tests # 5min + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + fast_check: true + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration Test (LLM) # 30min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py 
--ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration Test (API Server 1) # 100min + timeout_in_minutes: 130 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/test_chat_utils.py + +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/rpc + - tests/entrypoints/instrumentator + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/instrumentator + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s tool_use + +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export 
VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Entrypoints Integration Test (Responses API) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai/responses + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai/responses + +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - examples/offline_inference/new_weight_syncing/ + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 
with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + # OLD rlhf examples + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + # NEW rlhf examples + - pushd ../examples/offline_inference/new_weight_syncing + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py + - popd + +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + optional: true + # grade: Blocking + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - 
vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # test with torchrun tp=2 and dp=4 with ep + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: EPLB Algorithm Test # 5min + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 10min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + ##### fast check tests ##### +##### 1 GPU test ##### + +- label: Regression Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking + 
source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Engine Test # 9min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + +- label: V1 Test e2e + engine # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test e2e (2 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # Only run tests that need exactly 2 GPUs + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" + +- label: V1 Test e2e (4 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction] + # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. 
+ # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + agent_pool: mi325_4 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # Only run tests that need 4 GPUs + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # split the test to avoid interference + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + # Integration test for streaming correctness (requires special branch). 
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +# TODO: Add the "V1 Test attention (MI300)" test group + +- label: V1 Test attention (H100) # 10min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/config/attention.py + - vllm/model_executor/layers/attention + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: Batch Invariance Tests (H100) # 10min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + timeout_in_minutes: 25 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + +- label: V1 Test others (CPU) # 5 mins + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples Test # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + commands: + - pip install tensorizer # for tensorizer test + # for basic + - python3 
basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py + # for multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill 
--max-model-len 2048 + +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + - pytest -v -s cuda/test_platform_no_cuda_init.py + +- label: Samplers Test # 56min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + +- label: LoRA Test %N # 20min each + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss_tp.py \ + --ignore=lora/test_qwen3moe_tp.py + parallelism: 4 + +##### .buildkite/test_areas/pytorch.yaml ##### +# corresponds to .buildkite/test_areas/pytorch.yaml +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +# corresponds to .buildkite/test_areas/pytorch.yaml +- label: PyTorch Compilation Passes Unit Tests + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + source_file_dependencies: + - vllm/ + - tests/compile/passes + commands: + # TODO: clean up this comment if not needed. It is used to + # keep track of the tests changes during vLLM IR Ops refactoring. + # Use `find` to launch multiple instances of pytest. + - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # # Limit to no custom ops to reduce running time + # # Wrap with quotes to escape yaml and avoid starting -k string with a - + # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
+ +- label: Cudagraph test + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/attention/ + - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) + - vllm/model_executor/layers/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - 
vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels FP8 MoE Test + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + optional: true + commands: + - pytest -v -s kernels/moe/test_deepep_moe.py + +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Kernels DeepGEMM Test (H100) # Nvidia-centric +# Not replicating for CUTLASS & CuTe + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + +- label: Kernels Helion Test + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + source_file_dependencies: + - vllm/utils/import_utils.py + - tests/kernels/helion/ + commands: + - pip install helion + - pytest -v -s kernels/helion/ + +- label: Model Executor Test # 23min + timeout_in_minutes: 35 + torch_nightly: true + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking +
source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Benchmarks # 11min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ + +- label: Quantization Test # 70min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.14.1 + - uv pip install --system conch-triton-kernels + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction] + 
agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: OpenAI API correctness # 10min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + - tools/ + commands: # LMEval+Transcription WER check + - bash ../tools/install_torchcodec_rocm.sh || exit 1 + - pytest -s entrypoints/openai/correctness/ + + +##### models test ##### + +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - vllm/transformers_utils/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) 
Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_terratorch.py + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow 
subset of standard language models tests. Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - export TORCH_NCCL_BLOCKING_WAIT=1 + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 
110 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + - tests/models/registry.py + no_gpu: true + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/models/multimodal + - tests/models/registry.py + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 100 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + torch_nightly: true + 
source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py + - pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Accuracy Eval (Small Models) # 5min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt + +- label: Multi-Modal Models Test (Extended) 1 # 60min + timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models Test (Extended) 2 #60min + timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - 
tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 # 75min + timeout_in_minutes: 150 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization + +- label: Transformers Nightly Models Test + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - python3 
examples/basic/offline_inference/chat.py + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper + +- label: Blackwell Fusion and Compile Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/passes/test_fusion_attn.py + - tests/compile/passes/test_silu_mul_quant_fusion.py + - tests/compile/passes/distributed/test_fusion_all_reduce.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/passes/test_fusion_attn.py + - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + + # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # # Wrap with quotes to escape yaml + # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
+ + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +- label: Blackwell GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Blackwell Quantized MoE Test + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +##### 1 GPU test ##### +##### multi gpus test ##### + +- label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, 
amdmultinode] + agent_pool: mi325_4 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | grep 'Same node test passed' | grep 'Node count test passed' + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed Tests (2 GPUs) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - 
vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + - examples/offline_inference/new_weight_syncing/ + commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Model Tests (2 GPUs) # 37min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + 
- tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' + +- label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # test bge_m3_sparse io_processor plugin + - pip install -e ./plugins/bge_m3_sparse_plugin + - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py + - pip uninstall bge_m3_sparse_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test + # other 
tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins + +- label: Pipeline + Context Parallelism Test # 45min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + optional: true + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + +- label: LoRA TP Test (Distributed) # 17 min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + optional: true + # grade: Blocking + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXME: find out which code initializes CUDA before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + + # Disabled for now because MXFP4 backend on non-cuda platform + # doesn't support LoRA yet + #- pytest -v -s -x lora/test_gptoss_tp.py + + +- label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt + +- label: NixlConnector PD accuracy tests (Distributed) # 30min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + optional: true + # grade: Blocking + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + optional: true + # grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + 
source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt + - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +##### multi gpus test ##### +##### A100 test ##### + +- label: Distributed Tests (A100) # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876 + # TODO: Remove when the bug is fixed in a future ROCm release + - export TORCH_NCCL_BLOCKING_WAIT=1 + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + + +- label: LM Eval Large Models # optional + gpu: a100 + optional: true + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: 
"/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### FP8 test ##### +- label: LM Eval Large Models (H100) # optional, still use H100 for consistency + gpu: h100 + optional: true + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=4 + + +##### H200 test ##### +- label: Distributed Tests (H200) # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + # TODO: this test is not supported on ROCm, there are aiter kernels for this. + # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
+ - pytest -v -s tests/distributed/test_context_parallel.py + - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + # this test is not supported on ROCm + # - pytest -v -s tests/v1/distributed/test_dbo.py + +##### B200 test ##### +- label: Distributed Tests (B200) # optional + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + +##### E2E Eval Tests ##### +- label: LM Eval Small Models (1 Card) # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: LM Eval Large Models (4 Card) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: ROCm LM Eval Large Models (8 Card) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + optional: true + num_gpus: 8 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 + +- label: 
ROCm GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + agent_pool: mi325_1 + mirror_hardwares: [amdexperimental, amdproduction] + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt + +##### EPLB Accuracy Tests ##### +- label: DeepSeek V2-Lite Accuracy + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + +##### .buildkite/test_areas/compile.yaml ##### +# Slowly setting up the tests so that it is also easier for the +# CI team to review and upstream to the pipelinev2. +# The following tests are important for vLLM IR Ops refactoring, +# which affects fusion passes on ROCm. So we have to +# enable them as soon as possible. 
+ +## TODO: Enable the test in this group +# # corresponds to .buildkite/test_areas/compile.yaml +# - label: Fusion and Compile Unit Tests (2xMI325 GPUs) +# timeout_in_minutes: 20 +# working_dir: "/vllm-workspace/" +# mirror_hardwares: [amdexperimental, amdproduction, tj] +# agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs +# source_file_dependencies: +# - csrc/quantization/fp4/ +# - vllm/model_executor/layers/quantization/ +# - vllm/model_executor/layers/layernorm.py +# - vllm/model_executor/layers/activation.py +# - vllm/model_executor/layers/attention/attention.py +# - vllm/v1/attention/backends/flashinfer.py +# - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes +# - tests/compile/test_fusion_attn.py +# - tests/compile/test_silu_mul_quant_fusion.py +# - tests/compile/distributed/test_fusion_all_reduce.py +# - tests/compile/fullgraph/test_full_graph.py +# commands: +# - rocm-smi +# # we run all backend tests on ROCm +# # These two tests are covered in "PyTorch Compilation Passes Unit Tests" +# # - "pytest -v -s tests/compile/passes/test_fusion_attn.py" +# # - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" +# # TODO: this test is not supported on ROCm, there are aiter kernels for this. 
+# # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py +# # TODO: find out more details +# # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E Quick (MI325) + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E Config Sweep (MI325) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run just llama3 (fp8) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" + +## There are no ops on ROCm for these tests. +## The test still passes but the logs are not useful. 
+## fused ops just call torch.ops.symm_mem which +## exists in ROCm even though they don't work +# - label: AsyncTP Correctness Tests (2xMI325 GPUs) +# - label: Fusion E2E TP2 Quick (MI325) +# - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) +# - label: Fusion E2E TP2 (MI325) +# - label: Sequence Parallel Correctness Tests (2xMI325 GPUs) + + +##################################################################################################################################### +# # +# MI355 test definitions ( currently the test set is completely mirrored // TBD which tests are to be routed there ultimately) # +# # +##################################################################################################################################### - label: Pytorch Nightly Dependency Override Check # 2min # if this test fails, it means the nightly torch version is not compatible with some # of the dependencies. Please check the error message and add the package to whitelist # in /vllm/tools/pre_commit/generate_nightly_torch_test.py mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking + agent_pool: mi355_1 + optional: true soft_fail: true source_file_dependencies: - requirements/nightly_torch_test.txt @@ -51,8 +3143,8 @@ steps: - label: Async Engine, Inputs, Utils, Worker Test # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/ - tests/multimodal @@ -64,8 +3156,8 @@ steps: - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/ - tests/test_inputs.py @@ -94,8 +3186,8 @@ steps: - label: Python-only Installation Test # 10min timeout_in_minutes: 20 
mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - tests/standalone_tests/python_only_compile.sh - setup.py @@ -105,8 +3197,8 @@ steps: - label: Basic Correctness Test # 20min timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true fast_check: true torch_nightly: true source_file_dependencies: @@ -122,8 +3214,7 @@ steps: - label: Entrypoints Unit Tests # 5min mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking + agent_pool: mi355_1 timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" fast_check: true @@ -132,13 +3223,13 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true @@ -155,8 +3246,8 @@ steps: - label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true @@ -172,27 +3263,27 @@ 
steps: - label: Entrypoints Integration Test (API Server 2) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true source_file_dependencies: - vllm/ - - tests/entrypoints/sleep - tests/entrypoints/rpc + - tests/entrypoints/instrumentator - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/sleep + - pytest -v -s entrypoints/instrumentator + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true @@ -206,8 +3297,8 @@ steps: - label: Entrypoints Integration Test (Responses API) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" fast_check: true torch_nightly: true @@ -220,8 +3311,9 @@ steps: - label: Distributed Tests (4 GPUs) # 35min timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi355_4 + optional: true # grade: Blocking working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -276,15 +3368,16 @@ steps: - popd # NEW rlhf examples - pushd ../examples/offline_inference/new_weight_syncing - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - popd - label: Distributed Tests (8 GPUs) # 4min timeout_in_minutes: 10 
mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 - # grade: Blocking + agent_pool: mi355_8 + optional: true gpu: h100 num_gpus: 8 working_dir: "/vllm-workspace/tests" @@ -304,8 +3397,8 @@ steps: - label: EPLB Algorithm Test # 5min mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking + agent_pool: mi355_1 + optional: true timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -316,8 +3409,8 @@ steps: - label: EPLB Execution Test # 10min mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 + optional: true timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -331,8 +3424,8 @@ steps: - label: Metrics, Tracing Test # 12min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - # grade: Blocking + agent_pool: mi355_2 + optional: true num_gpus: 2 source_file_dependencies: - vllm/ @@ -351,8 +3444,7 @@ steps: - label: Regression Test # 7min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking + agent_pool: mi355_1 source_file_dependencies: - vllm/ - tests/test_regression @@ -364,39 +3456,66 @@ steps: - label: Engine Test # 9min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 + agent_pool: mi355_1 + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + + +- label: V1 Test e2e + engine # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi355_1 + optional: true + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # 
VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test e2e (2 GPUs) # 65min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi355_2 + optional: true # grade: Blocking source_file_dependencies: - - vllm/ - - tests/engine - - tests/test_sequence - - tests/test_config - - tests/test_logger - - tests/test_vllm_port + - vllm/ + - tests/v1 commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + # Only run tests that need exactly 2 GPUs + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" -- label: V1 Test e2e + engine # 65min +- label: V1 Test e2e (4 GPUs) # 65min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. # See discussion here: https://github.com/vllm-project/vllm/pull/31040 - agent_pool: mi325_8 + agent_pool: mi355_4 + optional: true # grade: Blocking source_file_dependencies: - vllm/ - tests/v1 commands: - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. 
- - pytest -v -s v1/e2e - - pytest -v -s v1/engine + # Only run tests that need 4 GPUs + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/ - tests/v1 @@ -406,8 +3525,8 @@ steps: - label: V1 Test others # 42min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/ - tests/v1 @@ -430,12 +3549,10 @@ steps: - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine -# TODO: Add the "V1 Test attetion (MI300)" test group - - label: V1 Test attention (H100) # 10min mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true timeout_in_minutes: 30 gpu: h100 source_file_dependencies: @@ -448,7 +3565,7 @@ steps: - label: Batch Invariance Tests (H100) # 10min mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 + agent_pool: mi355_1 timeout_in_minutes: 25 gpu: h100 source_file_dependencies: @@ -462,6 +3579,8 @@ steps: - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py - label: V1 Test attention (B200) # 10min + mirror_hardwares: [amdexperimental, amdmi355] + agent_pool: mi355_1 timeout_in_minutes: 30 gpu: b200 source_file_dependencies: @@ -474,8 +3593,7 @@ steps: - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 - grade: Blocking + agent_pool: mi355_1 source_file_dependencies: - vllm/ - tests/v1 @@ -492,8 +3610,8 @@ steps: - label: Examples Test # 30min timeout_in_minutes: 45 mirror_hardwares: [amdexperimental, 
amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints @@ -502,12 +3620,12 @@ steps: commands: - pip install tensorizer # for tensorizer test # for basic - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 @@ -527,8 +3645,8 @@ steps: - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/ - tests/cuda @@ -539,21 +3657,20 @@ steps: - label: Samplers Test # 56min timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/model_executor/layers - vllm/sampling_metadata.py - tests/samplers - tests/conftest.py commands: - - pytest -v -s -m 'not skip_v1' samplers + - pytest -v -s samplers - label: LoRA Test %N # 20min each timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 - # grade: Blocking + agent_pool: mi355_1 
source_file_dependencies: - vllm/lora - tests/lora @@ -573,8 +3690,8 @@ steps: - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/ @@ -590,8 +3707,8 @@ steps: - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/ @@ -606,7 +3723,8 @@ steps: - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 + agent_pool: mi355_1 + optional: true # grade: Blocking torch_nightly: true source_file_dependencies: @@ -623,7 +3741,8 @@ steps: - label: Cudagraph test timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 + agent_pool: mi355_1 + optional: true source_file_dependencies: - tests/v1/cudagraph - vllm/v1/cudagraph_dispatcher.py @@ -636,8 +3755,8 @@ steps: - label: Kernels Core Operation Test # 48min timeout_in_minutes: 75 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - csrc/ - tests/kernels/core @@ -648,8 +3767,8 @@ steps: - label: Kernels Attention Test %N # 23min timeout_in_minutes: 35 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - csrc/attention/ - vllm/v1/attention @@ -663,8 +3782,8 @@ steps: - label: Kernels Quantization Test %N # 64min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - csrc/quantization/ - 
vllm/model_executor/layers/quantization @@ -676,8 +3795,8 @@ steps: - label: Kernels MoE Test %N # 40min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ - csrc/moe/ @@ -690,11 +3809,19 @@ steps: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 +- label: Kernels FP8 MoE Test + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi355_2 + optional: true + commands: + - pytest -v -s kernels/moe/test_deepep_moe.py + - label: Kernels Mamba Test # 31min timeout_in_minutes: 45 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba @@ -725,7 +3852,8 @@ steps: - label: Kernels Helion Test timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/utils/import_utils.py - tests/kernels/helion/ @@ -737,8 +3865,8 @@ steps: timeout_in_minutes: 35 torch_nightly: true mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/engine/arg_utils.py - vllm/config/model.py @@ -754,8 +3882,8 @@ steps: - label: Benchmarks # 11min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/.buildkite" source_file_dependencies: - benchmarks/ @@ -765,8 +3893,8 @@ steps: - label: Benchmarks CLI Test # 7min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + 
optional: true source_file_dependencies: - vllm/ - tests/benchmarks/ @@ -776,8 +3904,8 @@ steps: - label: Quantization Test # 70min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization @@ -797,8 +3925,8 @@ steps: - label: LM Eval Small Models # 53min timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization @@ -809,8 +3937,8 @@ steps: - label: OpenAI API correctness # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - csrc/ - vllm/entrypoints/openai/ @@ -826,8 +3954,8 @@ steps: - label: Basic Models Tests (Initialization) timeout_in_minutes: 45 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/ @@ -839,8 +3967,8 @@ steps: - label: Basic Models Tests (Extra Initialization) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ @@ -859,8 +3987,8 @@ steps: - label: Basic Models Tests (Other) timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/ @@ -872,8 +4000,8 @@ steps: - label: Basic Models Test (Other CPU) # 5min mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true 
timeout_in_minutes: 10 torch_nightly: true source_file_dependencies: @@ -887,8 +4015,8 @@ steps: - label: Language Models Tests (Standard) timeout_in_minutes: 25 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/ @@ -901,8 +4029,8 @@ steps: - label: Language Models Tests (Extra Standard) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ @@ -922,8 +4050,8 @@ steps: - label: Language Models Tests (Hybrid) %N timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/ @@ -943,8 +4071,7 @@ steps: - label: Language Models Test (Extended Generation) # 80min timeout_in_minutes: 110 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 optional: true source_file_dependencies: - vllm/ @@ -959,8 +4086,7 @@ steps: - label: Language Models Test (PPL) timeout_in_minutes: 110 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 optional: true source_file_dependencies: - vllm/ @@ -971,8 +4097,7 @@ steps: - label: Language Models Test (Extended Pooling) # 36min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 optional: true source_file_dependencies: - vllm/ @@ -983,7 +4108,7 @@ steps: - label: Language Models Test (MTEB) timeout_in_minutes: 110 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 + agent_pool: mi355_1 # grade: Blocking optional: true source_file_dependencies: @@ -995,7 +4120,8 @@ steps: - label: Multi-Modal Processor Test (CPU) timeout_in_minutes: 60 
mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/ - tests/models/multimodal @@ -1007,8 +4133,8 @@ steps: - label: Multi-Modal Processor Test # 44min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/ - tests/models/multimodal @@ -1019,8 +4145,8 @@ steps: - label: Multi-Modal Models Test (Standard) # 60min timeout_in_minutes: 100 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true torch_nightly: true source_file_dependencies: - vllm/ @@ -1037,8 +4163,8 @@ steps: - label: Multi-Modal Accuracy Eval (Small Models) # 5min timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - vllm/multimodal/ @@ -1052,8 +4178,7 @@ steps: - label: Multi-Modal Models Test (Extended) 1 # 60min timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 optional: true source_file_dependencies: - vllm/ @@ -1067,8 +4192,7 @@ steps: - label: Multi-Modal Models Test (Extended) 2 #60min timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 optional: true source_file_dependencies: - vllm/ @@ -1082,8 +4206,7 @@ steps: - label: Multi-Modal Models Test (Extended) 3 # 75min timeout_in_minutes: 150 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 optional: true source_file_dependencies: - vllm/ @@ -1097,30 +4220,17 @@ steps: - label: Quantized Models Test # 45 min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: 
Blocking + agent_pool: mi355_1 + optional: true source_file_dependencies: - vllm/model_executor/layers/quantization - tests/models/quantization commands: - pytest -v -s models/quantization -# This test is used only in PR development phase to test individual models and should never run on main -- label: Custom Models Test - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - optional: true - commands: - - echo 'Testing custom models...' - # PR authors can temporarily add commands below to test individual models - # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py - # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* - - label: Transformers Nightly Models Test mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 working_dir: "/vllm-workspace/" optional: true commands: @@ -1129,12 +4239,14 @@ steps: - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper -- label: Blackwell Test # 21 min +- label: Blackwell Test (MI355) # 21 min + mirror_hardwares: [amdexperimental, amdmi355] + agent_pool: mi355_1 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" gpu: b200 @@ -1153,28 +4265,28 @@ steps: - vllm/v1/attention/selector.py - vllm/platforms/cuda.py commands: - - nvidia-smi - - python3 examples/offline_inference/basic/chat.py + - rocm-smi + - python3 examples/basic/offline_inference/chat.py # Attention # num_heads2 broken by 
https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - # Quantization - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py + - pytest -v -s tests/kernels/attention/test_attention_selector.py + #- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + #- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + #- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + #- pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + ## Quantization + #- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + #- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + #- pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + #- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + #- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + #- pytest -v -s 
tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + #- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + #- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + #- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + #- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + #- pytest -v -s tests/kernels/moe/test_flashinfer.py + #- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 @@ -1244,13 +4356,15 @@ steps: - label: Blackwell LM Eval Small Models timeout_in_minutes: 120 + mirror_hardwares: [amdexperimental, amdproduction, amdmi355] + agent_pool: mi355_2 gpu: b200 optional: true # run on nightlies source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi355.txt ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1258,8 +4372,8 @@ steps: - label: Distributed Comm Ops Test # 7min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - # grade: Blocking + agent_pool: mi355_2 + optional: true working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -1274,8 +4388,7 @@ steps: - label: 2 Node Tests (4 GPUs in total) # 16min timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdmultinode] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 working_dir: "/vllm-workspace/tests" num_gpus: 2 num_nodes: 2 @@ -1300,8 +4413,9 @@ steps: - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi355_2 + optional: true # grade: Blocking working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -1334,15 +4448,15 @@ steps: - pytest -v -s 
./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py + - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py - label: Distributed Model Tests (2 GPUs) # 37min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 - # grade: Blocking + agent_pool: mi355_2 + optional: true working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -1363,8 +4477,8 @@ steps: - label: Plugin Tests (2 GPUs) # 40min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - # grade: Blocking + agent_pool: mi355_2 + optional: true working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: @@ -1380,6 +4494,10 @@ steps: - pip install -e ./plugins/prithvi_io_processor_plugin - pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y + # test bge_m3_sparse io_processor plugin + - pip install -e ./plugins/bge_m3_sparse_plugin + - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py + - pip uninstall bge_m3_sparse_plugin -y # end io_processor plugins test # begin stat_logger plugins test - pip install -e ./plugins/vllm_add_dummy_stat_logger @@ -1397,8 +4515,8 @@ steps: - label: Pipeline + Context Parallelism Test # 45min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 + optional: true working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -1414,8 +4532,8 @@ steps: - label: LoRA TP Test (Distributed) # 17 min 
timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 + optional: true num_gpus: 4 source_file_dependencies: - vllm/lora @@ -1439,8 +4557,7 @@ steps: - label: Weight Loading Multiple GPU Test # 33min timeout_in_minutes: 45 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - # grade: Blocking + agent_pool: mi355_2 working_dir: "/vllm-workspace/tests" num_gpus: 2 optional: true @@ -1452,8 +4569,7 @@ steps: - label: Weight Loading Multiple GPU Test - Large Models # optional mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 - # grade: Blocking + agent_pool: mi355_2 working_dir: "/vllm-workspace/tests" num_gpus: 2 optional: true @@ -1465,8 +4581,8 @@ steps: - label: NixlConnector PD accuracy tests (Distributed) # 30min mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 + optional: true timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -1479,8 +4595,8 @@ steps: - label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 + optional: true timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -1491,13 +4607,26 @@ steps: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh +- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi355_4 + # grade: Blocking + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r 
/vllm-workspace/requirements/kv_connectors_rocm.txt + - CROSS_LAYERS_BLOCKS=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + ##### multi gpus test ##### ##### A100 test ##### - label: Distributed Tests (A100) # optional mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 gpu: a100 optional: true num_gpus: 4 @@ -1519,8 +4648,7 @@ steps: gpu: a100 optional: true mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 num_gpus: 4 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: @@ -1535,8 +4663,7 @@ steps: gpu: h100 optional: true mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 num_gpus: 4 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: @@ -1550,8 +4677,7 @@ steps: ##### H200 test ##### - label: Distributed Tests (H200) # optional mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 - # grade: Blocking + agent_pool: mi355_2 gpu: h200 optional: true working_dir: "/vllm-workspace/" @@ -1585,8 +4711,7 @@ steps: - label: LM Eval Small Models (1 Card) # 15min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking + agent_pool: mi355_1 source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization @@ -1595,8 +4720,7 @@ steps: - label: LM Eval Large Models (4 Card) mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 gpu: a100 optional: true num_gpus: 4 @@ -1610,7 +4734,8 @@ steps: - label: ROCm LM Eval Large Models (8 Card) mirror_hardwares: [amdproduction] - agent_pool: mi325_8 + agent_pool: mi355_8 + optional: true num_gpus: 8 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" commands: @@ -1619,8 +4744,8 @@ steps: - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 - working_dir: 
"/vllm-workspace/" - agent_pool: mi325_1 + working_dir: "/vllm-workspace/tests" + agent_pool: mi355_1 mirror_hardwares: [amdexperimental, amdproduction] optional: true # run on nightlies source_file_dependencies: @@ -1629,29 +4754,13 @@ steps: - vllm/model_executor/layers/quantization/mxfp4.py - vllm/v1/attention/backends/flashinfer.py commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - -##### RL Integration Tests ##### -- label: Prime-RL Integration Test # 15min - mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 - # grade: Blocking - timeout_in_minutes: 30 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh - commands: - - bash .buildkite/scripts/run-prime-rl-test.sh + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt ##### EPLB Accuracy Tests ##### - label: DeepSeek V2-Lite Accuracy mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 timeout_in_minutes: 60 gpu: h100 optional: true @@ -1660,19 +4769,9 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 -- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) + mirror_hardwares: [amdexperimental, 
amdproduction, amdmi355] + agent_pool: mi355_2 timeout_in_minutes: 60 gpu: b200 optional: true @@ -1685,10 +4784,24 @@ steps: - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 - # grade: Blocking + agent_pool: mi355_4 optional: true num_gpus: 4 working_dir: "/vllm-workspace" commands: - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + +- label: Attention Benchmarks Smoke Test (B200-MI355) + device: b200 + mirror_hardwares: [amdexperimental, amdmi355] + agent_pool: mi355_2 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" + timeout_in_minutes: 10 + source_file_dependencies: + - benchmarks/attention_benchmarks/ + - vllm/v1/attention/ + commands: + - python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 + diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 73d4cf80c413e84f065a5bc802907e0fdbeb8bf9..b0a7ba8aa68f5879a00d2745cd85eacce80dd5be 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,1522 +1,8 @@ -# In this file, you can add more tests to run either by adding a new step or -# adding a new command to an existing step. See different options here for examples. +# This file has been deprecated as of Feb 18, 2026. The content has already been migrated to: -# This script will be feed into Jinja template in `test-template-aws.j2` at -# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 -# to generate the final pipeline yaml file. +# .buildkite/test_areas for test jobs +# .buildkite/image_build for image building jobs +# .buildkite/hardware_tests for jobs running on other hardwares (Intel, Ascend NPU, Arm, etc..) 
+# .buildkite/ci_config.yaml for configuration of CI pipeline -# Documentation -# label(str): the name of the test. emojis allowed. -# fast_check(bool): whether to run this on each commit on the fastcheck pipeline. -# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline. -# fast_check_only(bool): run this test on the fastcheck pipeline only -# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. -# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests). -# command(str): the single command to run for tests. incompatible with commands. -# commands(list): the list of commands to run for the test. incompatible with command. -# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] -# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 -# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. -# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, -# in this case, commands must be specified. the first command runs on the first host, the second -# command runs on the second host. -# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout. -# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB -# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. -# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests -# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. -# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch. 
- -# When adding a test -# - If the test belongs to an existing group, add it there -# - If the test is short, add to any existing step -# - If the test takes more than 10min, then it is okay to create a new step. -# Note that all steps execute in parallel. - -steps: -##### fast check tests ##### - -- label: Pytorch Nightly Dependency Override Check # 2min - # if this test fails, it means the nightly torch version is not compatible with some - # of the dependencies. Please check the error message and add the package to whitelist - # in /vllm/tools/pre_commit/generate_nightly_torch_test.py - soft_fail: true - source_file_dependencies: - - requirements/nightly_torch_test.txt - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - -- label: Async Engine, Inputs, Utils, Worker Test # 36min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/multimodal - - tests/utils_ - commands: - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - source_file_dependencies: - - vllm/ - - tests/test_inputs.py - - tests/test_outputs.py - - tests/test_pooling_params.py - - tests/multimodal - - tests/renderers - - tests/standalone_tests/lazy_imports.py - - tests/tokenizers_ - - tests/tool_parsers - - tests/transformers_utils - - tests/config - no_gpu: true - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s test_pooling_params.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s renderers - - pytest -v -s tokenizers_ - - pytest -v -s tool_parsers - - pytest -v -s transformers_utils - - pytest -v -s config - -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh - - setup.py - 
commands: - - bash standalone_tests/python_only_compile.sh - -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_cumem.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - -- label: Entrypoints Unit Tests # 5min - timeout_in_minutes: 10 - working_dir: "/vllm-workspace/tests" - fast_check: true - source_file_dependencies: - - vllm/entrypoints - - tests/entrypoints/ - commands: - - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - -- label: Entrypoints Integration Test (LLM) # 30min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/llm - - tests/entrypoints/offline_mode - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - -- label: Entrypoints Integration Test (API Server 1) # 100min - timeout_in_minutes: 130 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - 
source_file_dependencies: - - vllm/ - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - - pytest -v -s entrypoints/test_chat_utils.py - -- label: Entrypoints Integration Test (API Server 2) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/sleep - - tests/entrypoints/rpc - - tests/tool_use - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/sleep - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - - pytest -v -s tool_use - -- label: Entrypoints Integration Test (Pooling) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/pooling - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/pooling - -- label: Entrypoints Integration Test (Responses API) - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - fast_check: true - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/entrypoints/openai/responses - commands: - - pytest -v -s entrypoints/openai/responses - -- label: Distributed Tests (4 GPUs) # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl 
- - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py - - examples/offline_inference/rlhf.py - - examples/offline_inference/rlhf_colocate.py - - examples/offline_inference/new_weight_syncing/ - - tests/examples/offline_inference/data_parallel.py - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py - commands: - # https://github.com/NVIDIA/nccl/issues/1838 - - export NCCL_CUMEM_HOST_ENABLE=0 - # test with torchrun tp=2 and external_dp=2 - - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=2 and pp=2 - - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - # test with torchrun tp=4 and dp=1 - - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2, pp=2 and dp=1 - - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=1 and dp=4 with ep - - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with torchrun tp=2 and dp=2 with ep - - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - # test with internal dp - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/fullgraph/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s 
distributed/test_events.py - - pytest -v -s distributed/test_symm_mem_allreduce.py - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests - # OLD rlhf examples - - pushd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd - # NEW rlhf examples - - pushd ../examples/offline_inference/new_weight_syncing - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py - - popd - -- label: Distributed Tests (8 GPUs) # 4min - timeout_in_minutes: 10 - gpu: h100 - num_gpus: 8 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py - - vllm/config/parallel.py - - vllm/distributed/ - - vllm/v1/engine/llm_engine.py - - vllm/v1/executor/uniproc_executor.py - - vllm/v1/worker/gpu_worker.py - commands: - # https://github.com/NVIDIA/nccl/issues/1838 - - export NCCL_CUMEM_HOST_ENABLE=0 - # test with torchrun tp=2 and dp=4 with ep - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - -- label: EPLB Algorithm Test # 5min - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_algo.py - commands: - - pytest -v -s distributed/test_eplb_algo.py - -- label: EPLB Execution Test # 10min - timeout_in_minutes: 20 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/eplb - - tests/distributed/test_eplb_execute.py - commands: - - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py - -- label: Metrics, Tracing Test # 12min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - num_gpus: 2 - 
source_file_dependencies: - - vllm/ - - tests/v1/tracing - commands: - - "pip install \ - 'opentelemetry-sdk>=1.26.0' \ - 'opentelemetry-api>=1.26.0' \ - 'opentelemetry-exporter-otlp>=1.26.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1'" - - pytest -v -s v1/tracing - -##### fast check tests ##### -##### 1 GPU test ##### - -- label: Regression Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/test_regression - commands: - - pip install modelscope - - pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional - -- label: Engine Test # 9min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/engine - - tests/test_sequence - - tests/test_config - - tests/test_logger - - tests/test_vllm_port - commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - -- label: V1 Test e2e + engine # 30min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e - # Run this test standalone for now; - # need to untangle use (implicit) use of spawn/fork across the tests. 
- - pytest -v -s v1/engine/test_preprocess_error_handling.py - - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - -- label: V1 Test entrypoints # 35min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - -- label: V1 Test others # 42min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - # split the test to avoid interference - - pytest -v -s -m 'not cpu_test' v1/core - - pytest -v -s v1/executor - - pytest -v -s v1/kv_offload - - pytest -v -s v1/sample - - pytest -v -s v1/logits_processors - - pytest -v -s v1/worker - - pytest -v -s -m 'not slow_test' v1/spec_decode - - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'not cpu_test' v1/metrics - - pytest -v -s v1/test_oracle.py - - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_outputs.py - # Integration test for streaming correctness (requires special branch). 
- - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - -- label: V1 Test attention (H100) # 10min - timeout_in_minutes: 30 - gpu: h100 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - -- label: Batch Invariance Tests (H100) # 10min - timeout_in_minutes: 25 - gpu: h100 - source_file_dependencies: - - vllm/v1/attention - - vllm/model_executor/layers - - tests/v1/determinism/ - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pip install pytest-timeout pytest-forked - - pytest -v -s v1/determinism/test_batch_invariance.py - - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py - -- label: V1 Test attention (B200) # 10min - timeout_in_minutes: 30 - gpu: b200 - source_file_dependencies: - - vllm/config/attention.py - - vllm/model_executor/layers/attention - - vllm/v1/attention - - tests/v1/attention - commands: - - pytest -v -s v1/attention - -- label: V1 Test others (CPU) # 5 mins - source_file_dependencies: - - vllm/ - - tests/v1 - no_gpu: true - commands: - # split the test to avoid interference - - pytest -v -s -m 'cpu_test' v1/core - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s -m 'cpu_test' v1/kv_connector/unit - - pytest -v -s -m 'cpu_test' v1/metrics - - -- label: Examples Test # 30min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/examples" - source_file_dependencies: - - vllm/entrypoints - - vllm/multimodal - - examples/ - commands: - - pip install tensorizer # for tensorizer test - # for basic - - python3 offline_inference/basic/chat.py - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model 
meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py - # for multi-modal models - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - # for pooling models - - python3 pooling/embed/vision_embedding_offline.py --seed 0 - # for features demo - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - -- label: Platform Tests (CUDA) # 4min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/cuda - commands: - - pytest -v -s cuda/test_cuda_context.py - -- label: Samplers Test # 56min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - tests/samplers - - 
tests/conftest.py - commands: - - pytest -v -s samplers - - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - -- label: LoRA Test %N # 20min each - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - - pytest -v -s lora \ - --shard-id=$$BUILDKITE_PARALLEL_JOB \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --ignore=lora/test_chatglm3_tp.py \ - --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py \ - --ignore=lora/test_olmoe_tp.py \ - --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss_tp.py \ - --ignore=lora/test_qwen3moe_tp.py - - parallelism: 4 - -- label: PyTorch Compilation Unit Tests # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run unit tests defined directly under compile/, - # not including subdirectories, which are usually heavier - # tests covered elsewhere. - # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - # However, find does not normally propagate error codes, so we combine it with xargs - # (using -0 for proper path handling) - - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - - pytest -s -v compile/passes --ignore compile/passes/distributed - -- label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # Run smoke tests under fullgraph directory, except test_full_graph.py - # as it is a heavy test that is covered in other steps. 
- # Use `find` to launch multiple instances of pytest so that - # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - # However, find does not normally propagate error codes, so we combine it with xargs - # (using -0 for proper path handling) - - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/compile - commands: - # fp8 kv scales not supported on sm89, tested on Blackwell instead - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # # Limit to no custom ops to reduce running time - # # Wrap with quotes to escape yaml and avoid starting -k string with a - - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
- -- label: Cudagraph test - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - tests/v1/cudagraph - - vllm/v1/cudagraph_dispatcher.py - - vllm/config/compilation.py - - vllm/compilation - commands: - - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py - - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - -- label: Kernels Core Operation Test # 48min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/ - - tests/kernels/core - - tests/kernels/test_top_k_per_row.py - commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py - -- label: Kernels Attention Test %N # 23min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/attention/ - - vllm/v1/attention - # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - - vllm/model_executor/layers/attention - - tests/kernels/attention - commands: - - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 - -- label: Kernels Quantization Test %N # 64min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/layers/quantization - - tests/kernels/quantization - commands: - - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 - -- label: Kernels MoE Test %N # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/quantization/cutlass_w8a8/moe/ - - csrc/moe/ - - tests/kernels/moe - - vllm/model_executor/layers/fused_moe/ - - vllm/distributed/device_communicators/ - - vllm/envs.py - - vllm/config - commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 2 - -- label: Kernels Mamba Test # 31min 
- timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/mamba/ - - tests/kernels/mamba - - vllm/model_executor/layers/mamba/ops - commands: - - pytest -v -s kernels/mamba - -- label: Kernels DeepGEMM Test (H100) - timeout_in_minutes: 45 - gpu: h100 - num_gpus: 1 - source_file_dependencies: - - tools/install_deepgemm.sh - - vllm/utils/deep_gemm.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization - - tests/kernels/quantization/test_block_fp8.py - - tests/kernels/moe/test_deepgemm.py - - tests/kernels/moe/test_batched_deepgemm.py - - tests/kernels/attention/test_deepgemm_attention.py - commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s kernels/moe/test_deepgemm.py - - pytest -v -s kernels/moe/test_batched_deepgemm.py - - pytest -v -s kernels/attention/test_deepgemm_attention.py - -- label: Kernels Helion Test - timeout_in_minutes: 30 - gpu: h100 - source_file_dependencies: - - vllm/utils/import_utils.py - - tests/kernels/helion/ - commands: - - pip install helion - - pytest -v -s kernels/helion/ - - -- label: Kernels FP8 MoE Test (1 H100) - timeout_in_minutes: 90 - gpu: h100 - num_gpus: 1 - optional: true - commands: - - pytest -v -s kernels/moe/test_cutlass_moe.py - - pytest -v -s kernels/moe/test_flashinfer.py - - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py - - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py - - pytest -v -s kernels/moe/test_moe.py - # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main - - pytest -v -s kernels/moe/test_block_int8.py - - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py - - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py - -- label: Kernels FP8 MoE Test (2 H100s) - timeout_in_minutes: 90 - gpu: h100 - num_gpus: 2 - optional: true - commands: - - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py - - pytest -v -s kernels/moe/test_deepep_moe.py - - pytest -v -s 
kernels/moe/test_pplx_cutlass_moe.py - # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main - -- label: Kernels Fp4 MoE Test (B200) - timeout_in_minutes: 60 - gpu: b200 - num_gpus: 1 - optional: true - commands: - - pytest -v -s kernels/moe/test_cutedsl_moe.py - - pytest -v -s kernels/moe/test_flashinfer_moe.py - - pytest -v -s kernels/moe/test_nvfp4_moe.py - - pytest -v -s kernels/moe/test_ocp_mx_moe.py - - -- label: Model Executor Test # 23min - timeout_in_minutes: 35 - torch_nightly: true - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/engine/arg_utils.py - - vllm/config/model.py - - vllm/model_executor - - tests/model_executor - - tests/entrypoints/openai/test_tensorizer_entrypoint.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s model_executor - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - -- label: Benchmarks # 11min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/.buildkite" - source_file_dependencies: - - benchmarks/ - commands: - - bash scripts/run-benchmarks.sh - -- label: Benchmarks CLI Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/ - - tests/benchmarks/ - commands: - - pytest -v -s benchmarks/ - -- label: Quantization Test # 70min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - tests/quantization - commands: - # temporary install here since we need nightly, will move to requirements/test.in - # after torchao 0.12 release, and pin a working version of torchao nightly here - - # since torchao nightly is only compatible with torch nightly currently - # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now - # we can only upgrade after this is resolved - # 
TODO(jerryzh168): resolve the above comment - - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129 - - uv pip install --system conch-triton-kernels - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - -- label: LM Eval Small Models # 53min - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - autorun_on_main: true - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - -- label: OpenAI API correctness # 22min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - csrc/ - - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py - commands: # LMEval+Transcription WER check - - pytest -s entrypoints/openai/correctness/ - -##### models test ##### - -- label: Basic Models Tests (Initialization) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_initialization.py - - tests/models/registry.py - commands: - # Run a subset of model initialization tests - - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset - -- label: Basic Models Tests (Extra Initialization) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - vllm/transformers_utils/ - - tests/models/test_initialization.py - - tests/models/registry.py - commands: - # Only when vLLM model source is modified - test initialization of a large - # subset of supported models (the complement of the small subset in the above - # test.) 
Also run if model initialization test file is modified - - pytest -v -s models/test_initialization.py \ - -k 'not test_can_initialize_small_subset' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Basic Models Tests (Other) - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_terratorch.py - - tests/models/test_transformers.py - - tests/models/test_registry.py - commands: - - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - -- label: Basic Models Test (Other CPU) # 5min - timeout_in_minutes: 10 - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/test_utils.py - - tests/models/test_vision.py - no_gpu: true - commands: - - pytest -v -s models/test_utils.py models/test_vision.py - -- label: Language Models Tests (Standard) - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/language - commands: - # Test standard language models, excluding a subset of slow tests - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and (not slow_test)' - -- label: Language Models Tests (Extra Standard) %N - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/model_executor/models/ - - tests/models/language/pooling/test_embedding.py - - tests/models/language/generation/test_common.py - - tests/models/language/pooling/test_classification.py - commands: - # Shard slow subset of standard language models tests. 
Only run when model - # source is modified, or when specified test files are modified - - pip freeze | grep -E 'torch' - - pytest -v -s models/language -m 'core_model and slow_test' \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Language Models Tests (Hybrid) %N - timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - # Shard hybrid language model tests - - pytest -v -s models/language/generation \ - -m hybrid_model \ - --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ - --shard-id=$$BUILDKITE_PARALLEL_JOB - parallelism: 2 - -- label: Language Models Test (Extended Generation) # 80min - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation - commands: - # Install fast path packages for testing against transformers - # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - -- label: Language Models Test (PPL) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/generation_ppl_test - commands: - - pytest -v -s models/language/generation_ppl_test - -- label: Language Models Test (Extended Pooling) # 36min 
- timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling - commands: - - pytest -v -s models/language/pooling -m 'not core_model' - -- label: Language Models Test (MTEB) - timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/language/pooling_mteb_test - commands: - - pytest -v -s models/language/pooling_mteb_test - -- label: Multi-Modal Processor Test (CPU) - timeout_in_minutes: 60 - source_file_dependencies: - - vllm/ - - tests/models/multimodal - no_gpu: true - commands: - - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'" - - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Processor Test - timeout_in_minutes: 60 - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing/test_tensor_schema.py - -- label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 80 - mirror_hardwares: [amdexperimental] - torch_nightly: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - -- label: Multi-Modal Accuracy Eval (Small Models) # 50min - timeout_in_minutes: 70 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - -- label: Multi-Modal Models Test (Extended) 1 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - -- label: Multi-Modal Models Test (Extended) 2 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - -- label: Multi-Modal Models Test (Extended) 3 - mirror_hardwares: [amdexperimental] - optional: true - source_file_dependencies: - - vllm/ - - tests/models/multimodal - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - -- label: Quantized Models Test # 45 min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/model_executor/layers/quantization - - tests/models/quantization - commands: - - pytest -v -s models/quantization - -# This test is used only in PR development phase to test individual models and should never run on main -- label: Custom Models Test - mirror_hardwares: 
[amdexperimental] - optional: true - commands: - - echo 'Testing custom models...' - # PR authors can temporarily add commands below to test individual models - # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py - # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* - -- label: Transformers Nightly Models Test - working_dir: "/vllm-workspace/" - optional: true - soft_fail: true - commands: - - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py - - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - # Whisper needs spawn method to avoid deadlock - - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - -- label: Blackwell Test # 23 min - timeout_in_minutes: 30 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - csrc/quantization/fp4/ - - csrc/attention/mla/ - - csrc/quantization/cutlass_w8a8/moe/ - - vllm/model_executor/layers/fused_moe/cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py - - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/attention/backends/mla/cutlass_mla.py - - vllm/v1/attention/backends/mla/flashinfer_mla.py - - vllm/v1/attention/selector.py - - vllm/platforms/cuda.py - commands: - - nvidia-smi - - python3 examples/offline_inference/basic/chat.py - # Attention - # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - - pytest -v -s tests/kernels/attention/test_attention_selector.py - - 
pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py - - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py - # Quantization - - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py - - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py - - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - # e2e - - pytest -v -s tests/models/quantization/test_nvfp4.py - -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusion_attn.py - - tests/compile/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py - commands: - - nvidia-smi - - pytest -v -s tests/compile/test_fusion_attn.py - - 
pytest -v -s tests/compile/test_silu_mul_quant_fusion.py - # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # # Wrap with quotes to escape yaml - # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - -- label: Blackwell GPT-OSS Eval - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - -- label: Blackwell Quantized MoE Test - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - tests/quantization/test_blackwell_moe.py - - vllm/model_executor/models/deepseek_v2.py - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/models/llama4.py - - vllm/model_executor/layers/fused_moe - - vllm/model_executor/layers/quantization/compressed_tensors - - vllm/model_executor/layers/quantization/modelopt.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - pytest -s -v 
tests/quantization/test_blackwell_moe.py - -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - -##### 1 GPU test ##### -##### multi gpus test ##### - -- label: Distributed Comm Ops Test # 7min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/distributed - - tests/distributed - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py - - pytest -v -s distributed/test_shm_buffer.py - - pytest -v -s distributed/test_shm_storage.py - - pytest -v -s distributed/test_packed_tensor.py - - pytest -v -s distributed/test_weight_transfer.py - -- label: 2 Node Tests (4 GPUs in total) # 16min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - num_nodes: 2 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py - - .buildkite/scripts/run-multi-node-test.sh - commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 
--enforce-eager --trust-remote-code - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - -- label: Distributed Tests (2 GPUs) # 68min - timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/compilation/ - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/worker/worker_base.py - - vllm/v1/engine/ - - vllm/v1/worker/ - - tests/compile/fullgraph/test_basic_correctness.py - - tests/compile/test_wrapper.py - - tests/distributed/ - - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py - - tests/v1/shutdown - - tests/v1/worker/test_worker_memory_snapshot.py - commands: - # https://github.com/NVIDIA/nccl/issues/1838 - - export NCCL_CUMEM_HOST_ENABLE=0 - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - 
- pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - - pytest -v -s v1/worker/test_worker_memory_snapshot.py - -- label: Distributed Model Tests (2 GPUs) # 37min - timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/model_executor/model_loader/sharded_state_loader.py - - vllm/model_executor/models/ - - tests/basic_correctness/ - - tests/model_executor/model_loader/test_sharded_state_loader.py - - tests/models/ - commands: - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py - # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/language -v -s -m 'distributed(num_gpus=2)' - - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py - - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' - -- label: Plugin Tests (2 GPUs) # 40min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/plugins/ - - tests/plugins/ - commands: - # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall 
vllm_add_dummy_platform -y - # end platform plugin tests - # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py - - pip uninstall prithvi_io_processor_plugin -y - # end io_processor plugins test - # begin stat_logger plugins test - - pip install -e ./plugins/vllm_add_dummy_stat_logger - - pytest -v -s plugins_tests/test_stats_logger_plugins.py - - pip uninstall dummy_stat_logger -y - # end stat_logger plugins test - # other tests continue here: - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins - -- label: Pipeline + Context Parallelism Test # 45min - timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py - -- label: LoRA TP Test (Distributed) # 17 min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] - num_gpus: 4 - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # Alot of these tests are on the edge of OOMing - - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True - # There is some Tensor Parallelism related processing logic in LoRA that - # 
requires multi-GPU testing for validation. - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_llm_with_multi_loras.py - - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py - - -- label: Weight Loading Multiple GPU Test # 33min - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - -- label: Weight Loading Multiple GPU Test - Large Models # optional - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - gpu: a100 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - -- label: NixlConnector PD accuracy tests (Distributed) # 40min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - -- label: DP EP NixlConnector PD accuracy tests (Distributed) # 15min - timeout_in_minutes: 15 - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py - - tests/v1/kv_connector/nixl_integration/ - commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - - -##### multi gpus test ##### -##### A100 test ##### - -- label: Distributed Tests (A100) # optional - gpu: a100 - 
optional: true - num_gpus: 4 - source_file_dependencies: - - vllm/ - commands: - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s -x lora/test_mixtral.py - -- label: Acceptance Length Test (Large Models) # optional - timeout_in_minutes: 120 - gpu: h100 - optional: true - num_gpus: 1 - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/v1/spec_decode/ - - vllm/model_executor/models/mlp_speculator.py - - tests/v1/spec_decode/test_acceptance_length.py - commands: - - export VLLM_ALLOW_INSECURE_SERIALIZATION=1 - - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test - -- label: LM Eval Large Models # optional - gpu: a100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - -##### H100 test ##### -- label: LM Eval Large Models (H100) # optional - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - -- label: Sequence Parallel Tests (H100) # 60 min - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - gpu: h100 - optional: true - num_gpus: 2 - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - # Run sequence 
parallel tests - - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py - - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - -- label: Distributed Tests (H100) # optional - gpu: h100 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - - pytest -v -s tests/distributed/test_context_parallel.py - - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### H200 test ##### - -- label: LM Eval Large Models (H200) # optional - timeout_in_minutes: 60 - gpu: h200 - optional: true - num_gpus: 8 - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt - -##### B200 test ##### -- label: Distributed Tests (B200) # optional - gpu: b200 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### RL Integration Tests ##### -- label: Prime-RL Integration Test # 15min - timeout_in_minutes: 30 - optional: true - soft_fail: true - num_gpus: 2 - working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh - commands: - - nvidia-smi - - bash .buildkite/scripts/run-prime-rl-test.sh - -- label: DeepSeek V2-Lite Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) - timeout_in_minutes: 60 - gpu: h100 - 
optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - -- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) - timeout_in_minutes: 60 - gpu: b200 - optional: true - num_gpus: 2 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - -##### MoE Refactor (Temporary) Tests ##### - -- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional - gpu: h100 - optional: true - num_gpus: 2 - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt - -- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional - gpu: b200 - optional: true - num_gpus: 2 - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt - -- label: MoE Refactor Integration Test (B200 DP - TEMPORARY) # optional - gpu: b200 - optional: true - num_gpus: 2 - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt +# If you need to make changes to CI, please find the relevant file in these directories and make changes there. 
diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml index 574b642d407b0d131ad0bece4c9eb8b1a0dcca86..a30ec60ea9602fe39012509b14673b00c7a81bea 100644 --- a/.buildkite/test_areas/benchmarks.yaml +++ b/.buildkite/test_areas/benchmarks.yaml @@ -17,3 +17,15 @@ steps: - tests/benchmarks/ commands: - pytest -v -s benchmarks/ + +- label: Attention Benchmarks Smoke Test (B200) + device: b200 + num_gpus: 2 + optional: true + working_dir: "/vllm-workspace/" + timeout_in_minutes: 10 + source_file_dependencies: + - benchmarks/attention_benchmarks/ + - vllm/v1/attention/ + commands: + - python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 56fc011c77833a86f336162567d58d583aefcf75..5da7b64ac304adac2256013b6ca1567b6edd71d3 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -36,6 +36,16 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py +- label: AsyncTP Correctness Tests (B200) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/" + device: b200 + optional: true + num_devices: 2 + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py + - label: Distributed Compile Unit Tests (2xH100) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" @@ -91,8 +101,8 @@ steps: - nvidia-smi # Run all models and attn backends but only Inductor partition and native custom ops - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" - # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" + # Qwen/Deepseek requires +quant_fp8 
as -quant_fp8 rms+quant fusion is not supported + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)" - label: Fusion E2E Config Sweep (H100) timeout_in_minutes: 30 @@ -121,13 +131,10 @@ steps: optional: true commands: - nvidia-smi - # Run all models and attn backends but only Inductor partition and native custom ops - # -k "inductor_partition and not +rms_norm and not +quant_fp8" - # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" - # Run just llama3 (fp8 & fp4) for all config combinations - # -k "llama-3" - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + # Run all models but only FLASHINFER, Inductor partition and native custom ops + # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported + # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition) + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)" - label: Fusion E2E TP2 Quick (H100) timeout_in_minutes: 20 @@ -143,8 +150,8 @@ steps: commands: - nvidia-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" + - pytest -v -s 
tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" - label: Fusion E2E TP2 AR-RMS Config Sweep (H100) timeout_in_minutes: 40 @@ -162,7 +169,7 @@ steps: - tests/compile/fusions_e2e/ commands: - nvidia-smi - # Run just llama3 (fp4 & fp8 & bf16) for all config combinations + # Run just llama3 (fp8 & bf16) for all config combinations - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3" - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) @@ -197,7 +204,8 @@ steps: - tests/compile/fusions_e2e/ commands: - nvidia-smi - # Run all models and attn backends but only Inductor partition and native custom ops + # Run all models but only FLASHINFER, Inductor partition and native custom ops + # include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # for ar-rms-quant-fp4, also sweep llama3 - - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4" - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4" + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 4fac613c3515e26a724b91a44a3c358485dae866..f94f831a49e2824b41e41eced55ca278d95982eb 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -50,23 +50,18 @@ steps: - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s 
v1/worker/test_worker_memory_snapshot.py -- label: Distributed Tests (4 GPUs) - timeout_in_minutes: 50 +- label: Distributed Torchrun + Examples (4 GPUs) + timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - vllm/distributed/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/distributed/test_events - - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_torchrun_example.py + - tests/distributed/test_torchrun_example_moe.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - examples/offline_inference/new_weight_syncing/ - tests/examples/offline_inference/data_parallel.py - - tests/v1/distributed - - tests/v1/engine/test_engine_core_client.py - - tests/distributed/test_symm_mem_allreduce.py commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 @@ -84,6 +79,27 @@ steps: - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + # OLD rlhf examples + - cd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + # NEW rlhf examples + - cd new_weight_syncing + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py + +- label: Distributed DP Tests (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_utils + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s 
v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -91,20 +107,27 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py + +- label: Distributed Compile + Comm (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - tests/distributed/test_symm_mem_allreduce.py + - tests/distributed/test_multiproc_executor.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests - # OLD rlhf examples - - cd ../examples/offline_inference - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - # NEW rlhf examples - - cd new_weight_syncing - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py - - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_async_new_apis.py + # test multi-node TP with multiproc executor (simulated on single node) + - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node - label: Distributed Tests (8 GPUs)(H100) timeout_in_minutes: 10 @@ -146,6 +169,7 @@ steps: num_devices: 2 commands: - pytest -v -s tests/distributed/test_context_parallel.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py - VLLM_USE_DEEP_GEMM=1 
VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py @@ -165,6 +189,7 @@ steps: num_devices: 2 num_nodes: 2 no_plugin: true + optional: true # TODO: revert once infra issue solved source_file_dependencies: - vllm/distributed/ - vllm/engine/ @@ -197,7 +222,31 @@ steps: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh -- label: Pipeline + Context Parallelism (4 GPUs)) +- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh + +- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) + timeout_in_minutes: 30 + device: a100 + working_dir: "/vllm-workspace/tests" + num_devices: 2 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/v1/worker/kv_connector_model_runner_mixin.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh + +- label: Pipeline + Context Parallelism (4 GPUs) timeout_in_minutes: 60 working_dir: "/vllm-workspace/tests" num_devices: 4 diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index 958bff5c95bb0ebc3233605295a6a139b1fb640a..5b7f96bc7a26cd4593a40b39da2657234c4c9ce3 100644 --- 
a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -29,15 +29,11 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 -- label: Prime-RL Integration (2 GPUs) - timeout_in_minutes: 30 +- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100) + timeout_in_minutes: 60 + device: h100 optional: true - soft_fail: true - num_devices: 2 + num_devices: 1 working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh commands: - - nvidia-smi - - bash .buildkite/scripts/run-prime-rl-test.sh + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030 diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index 82ce2f420053728896bd77619f700d8c425c4df4..be83bab8fa29b7daa37887d09cd039550607c16e 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -1,5 +1,5 @@ group: Engine -depends_on: +depends_on: - image-build steps: - label: Engine @@ -14,17 +14,59 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py -- label: V1 e2e + engine - timeout_in_minutes: 45 +- label: Engine (1 GPU) + timeout_in_minutes: 30 source_file_dependencies: - - vllm/ - - tests/v1 + - vllm/v1/engine/ + - tests/v1/engine/ commands: - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e - # Run this test standalone for now; - # need to untangle use (implicit) use of spawn/fork across the tests. 
- pytest -v -s v1/engine/test_preprocess_error_handling.py - # Run the rest of v1/engine tests - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py + +- label: e2e Scheduling (1 GPU) + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + commands: + - pytest -v -s v1/e2e/general/test_async_scheduling.py + +- label: e2e Core (1 GPU) + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/v1/ + - tests/v1/e2e/general/ + commands: + - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py + +- label: V1 e2e (2 GPUs) + timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability + optional: true + num_devices: 2 + source_file_dependencies: + - vllm/ + - tests/v1/e2e + commands: + # Only run tests that need exactly 2 GPUs + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" + mirror: + amd: + device: mi325_2 + depends_on: + - image-build-amd + +- label: V1 e2e (4 GPUs) + timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability + optional: true + num_devices: 4 + source_file_dependencies: + - vllm/ + - tests/v1/e2e + commands: + # Only run tests that need 4 GPUs + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" + mirror: + amd: + device: mi325_4 + depends_on: + - image-build-amd diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 8e02d9f60b4e9cbee9481b697a6c017a0f1e32d9..9de9c3fd2ddae3bfa2d34a1e679b39346d12979f 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -34,23 +34,26 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py 
--ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses - pytest -v -s entrypoints/test_chat_utils.py + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Entrypoints Integration (API Server 2) timeout_in_minutes: 130 working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/tool_use - - tests/entrypoints/sleep - - tests/entrypoints/instrumentator - tests/entrypoints/rpc + - tests/entrypoints/instrumentator + - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s entrypoints/instrumentator - - pytest -v -s entrypoints/sleep + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use - label: Entrypoints Integration (Pooling) @@ -79,6 +82,11 @@ steps: - tests/v1 commands: - pytest -v -s v1/entrypoints + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: OpenAI API Correctness timeout_in_minutes: 30 diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml index 9a10476ed78a6b624b82c6a30a827c5d535038ee..1443d847eaf505f1c700e99e61a58758f2b3d17f 100644 --- a/.buildkite/test_areas/expert_parallelism.yaml +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -20,4 +20,19 @@ steps: - tests/distributed/test_eplb_execute.py commands: - pytest -v -s distributed/test_eplb_execute.py - - pytest -v -s distributed/test_eplb_spec_decode.py \ No newline at end of file + - pytest -v -s 
distributed/test_eplb_spec_decode.py + +- label: Elastic EP Scaling Test + timeout_in_minutes: 20 + device: b200 + optional: true + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/compilation/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_elastic_ep.py diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 3f43b8d429a96e5a425a65de0c62cd921698c789..e0be49cf39c37eeac5634daa706aba1bf5daf15e 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -8,8 +8,9 @@ steps: - csrc/ - tests/kernels/core - tests/kernels/test_top_k_per_row.py + - tests/kernels/test_concat_mla_q.py commands: - - pytest -v -s kernels/core kernels/test_top_k_per_row.py + - pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py - label: Kernels Attention Test %N timeout_in_minutes: 35 @@ -44,7 +45,8 @@ steps: - vllm/envs.py - vllm/config commands: - - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 - label: Kernels Mamba Test @@ -70,7 +72,7 @@ steps: - tests/kernels/moe/test_batched_deepgemm.py - tests/kernels/attention/test_deepgemm_attention.py commands: - - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/quantization/test_block_fp8.py - pytest -v -s kernels/moe/test_deepgemm.py - pytest -v -s kernels/moe/test_batched_deepgemm.py - pytest -v -s kernels/attention/test_deepgemm_attention.py @@ -95,7 +97,7 @@ steps: - vllm/platforms/cuda.py commands: - nvidia-smi - - 
python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 - pytest -v -s tests/kernels/attention/test_attention_selector.py @@ -115,6 +117,7 @@ steps: - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_flashinfer_moe.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py # e2e - pytest -v -s tests/models/quantization/test_nvfp4.py @@ -154,9 +157,7 @@ steps: commands: - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py - pytest -v -s kernels/moe/test_deepep_moe.py - - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py - # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main - + - label: Kernels Fp4 MoE Test (B200) timeout_in_minutes: 60 device: b200 diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index 1ef29f36cec0bef6c66a2b697a23648455175dc3..3e2610e70a312b624f01a7f930bc56d7cdfe2587 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -11,17 +11,17 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt -- label: LM Eval Large Models (4 GPUs)(A100) - device: a100 - optional: true - num_devices: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 +# - label: LM Eval Large Models (4 GPUs)(A100) +# device: a100 +# optional: true +# num_devices: 4 +# working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" +# source_file_dependencies: +# - csrc/ +# - vllm/model_executor/layers/quantization +# commands: 
+# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - label: LM Eval Large Models (4 GPUs)(H100) device: h100 @@ -73,3 +73,29 @@ steps: num_devices: 2 commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt + +- label: GPQA Eval (GPT-OSS) (H100) + timeout_in_minutes: 120 + device: h100 + optional: true + num_devices: 2 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/evals/gpt_oss/ + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt + +- label: GPQA Eval (GPT-OSS) (B200) + timeout_in_minutes: 120 + device: b200 + optional: true + num_devices: 2 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/evals/gpt_oss/ + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index 1e931879672b7ef356f124ce82a183765b8ecaca..9280696d13b7c9dcac479d231e4b4295512bb8b2 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -9,6 +9,7 @@ steps: - tests/v1 commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - export VLLM_WORKER_MULTIPROC_METHOD=spawn # split the test to avoid interference - pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s v1/executor @@ -16,6 +17,7 @@ steps: - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker + # TODO: create another `optional` test group for slow tests - pytest -v -s -m 'not slow_test' v1/spec_decode - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/metrics @@ 
-25,6 +27,11 @@ steps: # Integration test for streaming correctness (requires special branch). - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: V1 Others (CPU) depends_on: @@ -60,12 +67,13 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test - - python3 offline_inference/basic/chat.py # for basic - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + # for basic + - python3 basic/offline_inference/chat.py + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 basic/offline_inference/classify.py + - python3 basic/offline_inference/embed.py + - python3 basic/offline_inference/score.py # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 @@ -108,9 +116,11 @@ steps: timeout_in_minutes: 50 source_file_dependencies: - vllm/ + - tests/detokenizer - tests/multimodal - tests/utils_ commands: + - pytest -v -s detokenizer - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ @@ -123,6 +133,7 @@ steps: - tests/test_inputs.py - tests/test_outputs.py - tests/test_pooling_params.py + - tests/test_ray_env.py - tests/multimodal - tests/renderers - tests/standalone_tests/lazy_imports.py @@ -136,6 +147,7 @@ steps: - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s test_pooling_params.py + - pytest -v -s test_ray_env.py - 
pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ @@ -143,20 +155,6 @@ steps: - pytest -v -s transformers_utils - pytest -v -s config -- label: GPT-OSS Eval (B200) - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - device: b200 - optional: true - source_file_dependencies: - - tests/evals/gpt_oss - - vllm/model_executor/models/gpt_oss.py - - vllm/model_executor/layers/quantization/mxfp4.py - - vllm/v1/attention/backends/flashinfer.py - commands: - - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - - label: Batch Invariance (H100) timeout_in_minutes: 25 device: h100 diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85421399d1b8d96ef6dd3107d493d4283e82cf22 --- /dev/null +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -0,0 +1,110 @@ +group: Model Runner V2 +depends_on: + - image-build +steps: +- label: Model Runner V2 Core Tests + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/v1/worker/gpu/ + - vllm/v1/worker/gpu_worker.py + - vllm/v1/core/sched/ + - vllm/v1/attention/ + - tests/v1/engine/test_llm_engine.py + - tests/v1/e2e/ + - tests/v1/entrypoints/llm/test_struct_output_generate.py + commands: + - set -x + - export VLLM_USE_V2_MODEL_RUNNER=1 + - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics" + # This requires eager until we sort out CG correctness issues. + # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged. + - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram" + - pytest -v -s v1/e2e/general/test_context_length.py + - pytest -v -s v1/e2e/general/test_min_tokens.py + # Temporary hack filter to exclude ngram spec decoding based tests. 
+ - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0" + +- label: Model Runner V2 Examples + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/v1/worker/gpu/ + - vllm/v1/core/sched/ + - vllm/v1/worker/gpu_worker.py + - examples/offline_inference/ + - examples/basic/offline_inference/ + - examples/pooling/embed/vision_embedding_offline.py + - examples/others/tensorize_vllm_model.py + commands: + - set -x + - export VLLM_USE_V2_MODEL_RUNNER=1 + - pip install tensorizer # for tensorizer test + - python3 basic/offline_inference/chat.py # for basic + - python3 basic/offline_inference/generate.py --model facebook/opt-125m + #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO + #- python3 basic/offline_inference/embed.py # TODO + # for multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/embed/vision_embedding_offline.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + 
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +- label: Model Runner V2 Distributed (2 GPUs) + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/tests" + num_devices: 2 + source_file_dependencies: + - vllm/v1/worker/gpu/ + - vllm/v1/worker/gpu_worker.py + - tests/basic_correctness/test_basic_correctness.py + - tests/v1/distributed/test_async_llm_dp.py + - tests/v1/distributed/test_eagle_dp.py + commands: + - set -x + - export VLLM_USE_V2_MODEL_RUNNER=1 + # The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported. + - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True" + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray" + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + +# These require fix https://github.com/vllm-project/vllm/pull/36280 +- label: Model Runner V2 Pipeline Parallelism (4 GPUs) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_devices: 4 + source_file_dependencies: + - vllm/v1/worker/gpu/ + - vllm/v1/worker/gpu_worker.py + - tests/distributed/test_pipeline_parallel.py + #- tests/distributed/test_pp_cudagraph.py + commands: + - set -x + - export VLLM_USE_V2_MODEL_RUNNER=1 + - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba" + # TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged. 
+ #- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray" + +- label: Model Runner V2 Spec Decode + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/worker/gpu/ + - vllm/v1/worker/gpu_worker.py + - tests/v1/spec_decode/test_max_len.py + - tests/v1/e2e/spec_decode/test_spec_decode.py + commands: + - set -x + - export VLLM_USE_V2_MODEL_RUNNER=1 + - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp" + - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp" diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index df0a98dc9c2cc2266cda974a4256456b983bfe20..c1cc9e9a36e09d34e8de86085afc041044eeac81 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -4,7 +4,6 @@ depends_on: steps: - label: Basic Models Tests (Initialization) timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -16,7 +15,6 @@ steps: - label: Basic Models Tests (Extra Initialization) %N timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ @@ -38,6 +36,12 @@ steps: - tests/models/test_registry.py commands: - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + - label: Basic Models Test (Other CPU) # 5min depends_on: @@ -61,7 +65,7 @@ steps: - pytest -v -s tests/models/test_transformers.py - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - - python3 examples/offline_inference/basic/chat.py + - python3 examples/basic/offline_inference/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 
examples/offline_inference/audio_language.py --model-type whisper diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index f70192c4ebc0ab7f50a77b547951eeb9bebbfa2a..a3bd21ccff3cd58a4a60499e8b6ff058c0281adc 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -4,7 +4,6 @@ depends_on: steps: - label: Language Models Tests (Standard) timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -16,7 +15,6 @@ steps: - label: Language Models Tests (Extra Standard) %N timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ @@ -32,7 +30,6 @@ steps: - label: Language Models Tests (Hybrid) %N timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -40,7 +37,7 @@ steps: commands: # Install fast path packages for testing against transformers # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' # Shard hybrid language model tests - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB @@ -48,7 +45,6 @@ steps: - label: Language Models Test (Extended Generation) # 80min timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ @@ -56,13 +52,21 @@ steps: commands: # Install fast path packages for testing against transformers # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 
'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - label: Language Models Test (PPL) timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ @@ -72,17 +76,20 @@ steps: - label: Language Models Test (Extended Pooling) # 36min timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ - tests/models/language/pooling commands: - pytest -v -s models/language/pooling -m 'not core_model' + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Language Models Test (MTEB) timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index 4d05fb2af028dd4cc934c85c644e2767a14a51d4..eb10bf6c71c231eb3d051373b4be198ec7594b08 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -2,16 +2,65 @@ group: Models - Multimodal depends_on: - image-build steps: -- label: Multi-Modal Models (Standard) # 60min - timeout_in_minutes: 80 +- label: "Multi-Modal Models (Standard) 1: qwen2" + timeout_in_minutes: 45 source_file_dependencies: - vllm/ - tests/models/multimodal 
commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2" + - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + +- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + +- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma" + - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + +- label: "Multi-Modal Models (Standard) 4: other + whisper" + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py 
--ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Multi-Modal Processor Test (CPU) depends_on: @@ -20,6 +69,7 @@ steps: source_file_dependencies: - vllm/ - tests/models/multimodal + - tests/models/registry.py device: cpu commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git @@ -30,6 +80,7 @@ steps: source_file_dependencies: - vllm/ - tests/models/multimodal + - tests/models/registry.py commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/processing/test_tensor_schema.py @@ -52,6 +103,11 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Multi-Modal Models (Extended) 2 optional: true @@ -70,12 +126,3 @@ steps: commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - -# This test is used only in PR development phase to test individual models and should never run on main -- label: Custom Models - optional: true - commands: - - echo 'Testing custom models...' - # PR authors can temporarily add commands below to test individual models - # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py - # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml index ccc54b47abd4599308271f39afef6dadc977d033..7e7727fce7df4f0aeb167d5abef5fcb9b7b3128c 100644 --- a/.buildkite/test_areas/plugins.yaml +++ b/.buildkite/test_areas/plugins.yaml @@ -15,10 +15,17 @@ steps: - pytest -v -s plugins_tests/test_platform_plugins.py - pip uninstall vllm_add_dummy_platform -y # end platform plugin tests - # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + # begin io_processor plugins test + # test generic io_processor plugins functions + - pytest -v -s ./plugins_tests/test_io_processor_plugins.py + # test Terratorch io_processor plugins - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pytest -v -s plugins_tests/test_terratorch_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y + # test bge_m3_sparse io_processor plugin + - pip install -e ./plugins/bge_m3_sparse_plugin + - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py + - pip uninstall bge_m3_sparse_plugin -y # end io_processor plugins test # begin stat_logger plugins test - pip install -e ./plugins/vllm_add_dummy_stat_logger diff --git a/.buildkite/test_areas/ray_compat.yaml b/.buildkite/test_areas/ray_compat.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7917b0a4ff8b984120dcb55a83b54b44406f4739 --- /dev/null +++ b/.buildkite/test_areas/ray_compat.yaml @@ -0,0 +1,16 @@ +group: Ray Compatibility +depends_on: + - image-build +steps: +- label: Ray Dependency Compatibility Check + # Informational only — does not block the pipeline. + # If this fails, it means the PR introduces a dependency that + # conflicts with Ray's dependency constraints. 
+ # See https://github.com/vllm-project/vllm/issues/33599 + soft_fail: true + timeout_in_minutes: 10 + source_file_dependencies: + - requirements/ + - setup.py + commands: + - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index ad377148fd07322bc1f259db3e9e6c8f8ab3c087..2052a379827ab624b6e8576cffe32635d7012d07 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -12,3 +12,10 @@ steps: commands: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + commands: + - pytest -v -s samplers diff --git a/.buildkite/test_areas/spec_decode.yaml b/.buildkite/test_areas/spec_decode.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8dba7a2f8c6644a6a4198da34a7b8c05ee83baed --- /dev/null +++ b/.buildkite/test_areas/spec_decode.yaml @@ -0,0 +1,40 @@ +group: Spec Decode +depends_on: + - image-build +steps: +- label: Spec Decode Eagle + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - tests/v1/e2e/spec_decode/ + commands: + - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness" + +- label: Spec Decode Speculators + MTP + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/transformers_utils/configs/speculators/ + - tests/v1/e2e/spec_decode/ + commands: + - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" + +- label: Spec Decode Ngram + Suffix + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - tests/v1/e2e/spec_decode/ + commands: + - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix" + +- label: Spec Decode Draft Model + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/v1/spec_decode/ + - 
vllm/v1/worker/gpu/spec_decode/ + - tests/v1/e2e/spec_decode/ + commands: + - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml index 3561d57076bac3790c604bd2660ee6e917116aa3..8e86374a8ad02efefe35366cb7e421b809c4d264 100644 --- a/.buildkite/test_areas/weight_loading.yaml +++ b/.buildkite/test_areas/weight_loading.yaml @@ -13,13 +13,13 @@ steps: commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt -- label: Weight Loading Multiple GPU - Large Models # optional - working_dir: "/vllm-workspace/tests" - num_devices: 2 - device: a100 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt +# - label: Weight Loading Multiple GPU - Large Models # optional +# working_dir: "/vllm-workspace/tests" +# num_devices: 2 +# device: a100 +# optional: true +# source_file_dependencies: +# - vllm/ +# - tests/weight_loading +# commands: +# - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml deleted file mode 100644 index 443dfa45af22c16ee3619b76caa0a910735a657c..0000000000000000000000000000000000000000 --- a/.github/.bc-linter.yml +++ /dev/null @@ -1,24 +0,0 @@ -# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md -version: 1 -paths: -# We temporarily disable globally, and will only enable with `annotations.include` -# include: -# - "vllm/v1/attetion/*.py" -# - "vllm/v1/core/*.py" -exclude: - - "**/*.py" - -scan: - functions: true # check free functions and methods - classes: true # check classes/dataclasses - public_only: true # ignore names starting with "_" at any level - -annotations: - include: # decorators that force‑include a symbol - - name: 
"bc_linter_include" # matched by simple name or dotted suffix - propagate_to_members: false # for classes, include methods/inner classes - exclude: # decorators that force‑exclude a symbol - - name: "bc_linter_skip" # matched by simple name or dotted suffix - propagate_to_members: true # for classes, exclude methods/inner classes - -excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"] diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2e7930785483c3a69b28a90071dfb93b6b40c956..653d6c42e9af1ced5da2640cce27603ef2243fa4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,45 +2,66 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn -/vllm/model_executor/layers/attention @LucasWilkinson +/vllm/compilation @zou3519 @youkaichao @ProExpertProg @BoyuanFeng +/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery +/vllm/lora @jeejeelee +/vllm/model_executor/layers/attention @LucasWilkinson @MatthewBonanni /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/model_loader @22quinn /vllm/model_executor/layers/batch_invariant.py @yewentao256 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa -/vllm/vllm_flash_attn @LucasWilkinson -/vllm/lora @jeejeelee -/vllm/reasoning @aarnphm @chaunceyjiang -/vllm/entrypoints @aarnphm @chaunceyjiang -/vllm/tool_parsers @aarnphm @chaunceyjiang -/vllm/compilation @zou3519 @youkaichao @ProExpertProg -/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery +/vllm/vllm_flash_attn @LucasWilkinson @MatthewBonanni CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people 
/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg -/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 +/vllm/config/cache.py @heheda12345 + +# Entrypoints +/vllm/entrypoints/anthropic @mgoin @DarkLight1337 +/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb +/vllm/entrypoints/mcp @heheda12345 +/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb +/vllm/entrypoints/openai/realtime @njhill +/vllm/entrypoints/openai/speech_to_text @NickLucche +/vllm/entrypoints/pooling @noooop +/vllm/entrypoints/sagemaker @DarkLight1337 +/vllm/entrypoints/serve @njhill +/vllm/entrypoints/*.py @njhill +/vllm/entrypoints/chat_utils.py @DarkLight1337 +/vllm/entrypoints/llm.py @DarkLight1337 + +# Input/Output Processing +/vllm/sampling_params.py @njhill @NickLucche +/vllm/pooling_params.py @noooop @DarkLight1337 +/vllm/tokenizers @DarkLight1337 @njhill +/vllm/renderers @DarkLight1337 @njhill +/vllm/reasoning @aarnphm @chaunceyjiang +/vllm/tool_parsers @aarnphm @chaunceyjiang # vLLM V1 -/vllm/v1/attention @LucasWilkinson +/vllm/v1/attention @LucasWilkinson @MatthewBonanni /vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill /vllm/v1/attention/backends/mla @pavanimajety /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/triton_attn.py @tdoublep /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /vllm/v1/sample @22quinn @houseroad @njhill -/vllm/v1/spec_decode @benchislett @luccafong +/vllm/v1/spec_decode @benchislett @luccafong @MatthewBonanni /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/kv_offload @ApostaC @orozery -/vllm/v1/worker/gpu/kv_connector.py @orozery 
-/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery +/vllm/v1/engine @njhill +/vllm/v1/executor @njhill +/vllm/v1/worker @njhill +/vllm/v1/worker/kv_connector_model_runner_mixin.py @orozery @NickLucche # Model runner V2 -/vllm/v1/worker/gpu @WoosukKwon +/vllm/v1/worker/gpu @WoosukKwon @njhill +/vllm/v1/worker/gpu/kv_connector.py @orozery # Test ownership /.buildkite/lm-eval-harness @mgoin @@ -115,8 +136,8 @@ mkdocs.yaml @hmellor /vllm/model_executor/models/mixtral*.py @patrickvonplaten /vllm/model_executor/models/voxtral*.py @patrickvonplaten /vllm/model_executor/models/pixtral*.py @patrickvonplaten +/vllm/tokenizers/mistral.py @patrickvonplaten /vllm/transformers_utils/configs/mistral.py @patrickvonplaten -/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten # Kernels /vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep @@ -152,9 +173,7 @@ mkdocs.yaml @hmellor /examples/pooling @noooop /tests/models/*/pooling* @noooop /tests/entrypoints/pooling @noooop -/vllm/entrypoints/pooling @noooop /vllm/config/pooler.py @noooop -/vllm/pooling_params.py @noooop /vllm/model_executor/layers/pooler @noooop # Security guide and policies diff --git a/.github/mergify.yml b/.github/mergify.yml index 080767ca7218ae5f4fdd9c60985a3453f69e6e1b..c6d1f1fed52daa6371d4cbc1a6aaed2a4f2e1c4f 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -3,6 +3,7 @@ pull_request_rules: description: Automatically apply documentation label conditions: - label != stale + - -closed - or: - files~=^[^/]+\.md$ - files~=^docs/ @@ -26,7 +27,7 @@ pull_request_rules: Hi @{{author}}, the pre-commit checks have failed. Please run: ```bash - uv pip install pre-commit + uv pip install "pre-commit>=4.5.1" pre-commit install pre-commit run --all-files ``` @@ -37,15 +38,13 @@ pull_request_rules: > [!TIP] >
- > Is mypy or markdownlint failing? + > Is mypy failing? >
- > mypy and markdownlint are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally: + > mypy is run differently in CI. If the failure is related to this check, please use the following command to run it locally: > > ```bash > # For mypy (substitute "3.10" with the failing version if needed) > pre-commit run --hook-stage manual mypy-3.10 - > # For markdownlint - > pre-commit run --hook-stage manual markdownlint > ``` >
@@ -259,8 +258,7 @@ pull_request_rules: - files=benchmarks/run_structured_output_benchmark.sh - files=docs/features/structured_outputs.md - files=examples/offline_inference/structured_outputs.py - - files=examples/online_serving/openai_chat_completion_structured_outputs.py - - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py + - files=examples/online_serving/structured_outputs/structured_outputs.py - files~=^tests/v1/structured_output/ - files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ @@ -336,7 +334,7 @@ pull_request_rules: - or: - files~=^tests/tool_use/ - files~=^tests/entrypoints/openai/tool_parsers/ - - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py + - files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py - files~=^vllm/entrypoints/openai/tool_parsers/ - files=docs/features/tool_calling.md - files~=^examples/tool_chat_* diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml deleted file mode 100644 index 823695a921321921115153f79c8dd0232d097330..0000000000000000000000000000000000000000 --- a/.github/workflows/bc-lint.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: BC Lint - -on: - pull_request: - types: - - opened - - synchronize - - reopened - - labeled - - unlabeled - -jobs: - bc_lint: - if: github.repository_owner == 'vllm-project' - runs-on: ubuntu-latest - steps: - - name: Run BC Lint Action - uses: pytorch/test-infra/.github/actions/bc-lint@main - with: - repo: ${{ github.event.pull_request.head.repo.full_name }} - base_sha: ${{ github.event.pull_request.base.sha }} - head_sha: ${{ github.event.pull_request.head.sha }} - suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }} - docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter' - config_dir: .github - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} - 
cancel-in-progress: true diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index df8910837715dcf11ab79809c6bbe8fcf459df1d..f1a91a7cd16f16829d71030d3b252b1726753bef 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -19,6 +19,7 @@ jobs: uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: '3.12' + cache: 'pip' - name: Install Python dependencies run: | diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 5af045882f3505ee3b4b22647740d8573cacb196..838ba1124dcd0c900183329a826e6e7d6cad7173 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -6,6 +6,9 @@ on: - main workflow_dispatch: # Manual trigger +permissions: + contents: read + jobs: macos-m1-smoke-test: runs-on: macos-latest diff --git a/.gitignore b/.gitignore index 375b1b7ebadfae9edec6fef0b564d405a9a12374..d62536cfb91d741f17c1da7b4dbbe1b1023fdccd 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ # vllm-flash-attn built from source vllm/vllm_flash_attn/* +!vllm/vllm_flash_attn/__init__.py +!vllm/vllm_flash_attn/flash_attn_interface.py # OpenAI triton kernels copied from source vllm/third_party/triton_kernels/* @@ -187,11 +189,9 @@ cython_debug/ .vscode/ # Claude -CLAUDE.md .claude/ # Codex -AGENTS.md .codex/ # Cursor @@ -238,3 +238,6 @@ ep_kernels_workspace/ vllm/grpc/vllm_engine_pb2.py vllm/grpc/vllm_engine_pb2_grpc.py vllm/grpc/vllm_engine_pb2.pyi + +# Ignore generated cpu headers +csrc/cpu/cpu_attn_dispatch_generated.h diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db7321b9345e2e626c6e83dc8fbb883e66079121..0b17ad7335c7556ca9474a6137769e6588d2363e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: args: [--output-format, github, --fix] - id: ruff-format - repo: https://github.com/crate-ci/typos - rev: v1.38.1 + rev: 
v1.43.5 hooks: - id: typos args: [--force-exclude] @@ -24,12 +24,13 @@ repos: exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] -- repo: https://github.com/igorshubovych/markdownlint-cli - rev: v0.45.0 +- repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.21.0 hooks: - - id: markdownlint - exclude: '.*\.inc\.md' - stages: [manual] # Only run in CI + - id: markdownlint-cli2 + language_version: lts + args: [--fix] + exclude: ^CLAUDE\.md$ - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: @@ -55,7 +56,7 @@ repos: language: python types_or: [python, pyi] require_serial: true - additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] + additional_dependencies: ["mypy[faster-cache]==1.19.1", regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 entry: python tools/pre_commit/mypy.py 1 "3.10" @@ -127,6 +128,13 @@ repos: language: python types: [python] additional_dependencies: [regex] + # prevent use torch.cuda APIs + - id: check-torch-cuda-call + name: "Prevent new 'torch.cuda' APIs call" + entry: python tools/pre_commit/check_torch_cuda.py + language: python + types: [python] + additional_dependencies: [regex] - id: validate-config name: Validate configuration has default values and that each field has a docstring entry: python tools/pre_commit/validate_config.py @@ -143,6 +151,11 @@ repos: name: Check attention backend documentation is up to date entry: python tools/pre_commit/generate_attention_backend_docs.py --check language: python + - id: check-boolean-context-manager + name: Check for boolean ops in with-statements + entry: python 
tools/pre_commit/check_boolean_context_manager.py + language: python + types: [python] # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/.readthedocs.yaml b/.readthedocs.yaml index d83d6df35ed9a0e9b0ef3d71d32caeeffeedb402..1e479fd03d9174bfb721934a61727816fa5c9e0c 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,13 +9,15 @@ build: python: "3.12" jobs: post_checkout: - - git fetch --unshallow || true + # - bash docs/maybe_skip_pr_build.sh + - git fetch origin main --unshallow --no-tags --filter=blob:none || true + pre_create_environment: + - pip install uv + create_environment: + - uv venv $READTHEDOCS_VIRTUALENV_PATH + install: + - uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt mkdocs: configuration: mkdocs.yaml fail_on_warning: true - -# Optionally declare the Python requirements required to build your docs -python: - install: - - requirements: requirements/docs.txt diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000000000000000000000000000000000..c541a370b50ef0c456ce7b5477461c9f49257719 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,113 @@ +# Agent Instructions for vLLM + +> These instructions apply to **all** AI-assisted contributions to `vllm-project/vllm`. +> Breaching these guidelines can result in automatic banning. + +## 1. Contribution Policy (Mandatory) + +### Duplicate-work checks + +Before proposing a PR, run these checks: + +```bash +gh issue view --repo vllm-project/vllm --comments +gh pr list --repo vllm-project/vllm --state open --search " in:body" +gh pr list --repo vllm-project/vllm --state open --search "" +``` + +- If an open PR already addresses the same fix, do not open another. +- If your approach is materially different, explain the difference in the issue. + +### No low-value busywork PRs + +Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). 
Mechanical cleanups are acceptable only when bundled with substantive work. + +### Accountability + +- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end. +- The submitting human must review every changed line and run relevant tests. +- PR descriptions for AI-assisted work **must** include: + - Why this is not duplicating an existing PR. + - Test commands run and results. + - Clear statement that AI assistance was used. + +### Fail-closed behavior + +If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing. + +--- + +## 2. Development Workflow + +### Environment setup + +```bash +# Install `uv` if you don't have it already: +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Always use `uv` for Python environment management: +uv venv --python 3.12 +source .venv/bin/activate + +# Always make sure `pre-commit` and its hooks are installed: +uv pip install -r requirements/lint.txt +pre-commit install +``` + +### Installing dependencies + +```bash +# If you are only making Python changes: +VLLM_USE_PRECOMPILED=1 uv pip install -e . + +# If you are also making C/C++ changes: +uv pip install -e . +``` + +### Running tests + +Tests require extra dependencies. 
+All versions for test dependencies should be read from `requirements/test.txt` + +```bash +# Install bare minimum test dependencies: +uv pip install pytest pytest-asyncio tblib + +# Install additional test dependencies as needed, or install them all as follows: +uv pip install -r requirements/test.txt + +# Run specific test from specific test file +pytest tests/path/to/test.py -v -s -k test_name + +# Run all tests in directory +pytest tests/path/to/dir -v -s +``` + +### Running linters + +```bash +# Run all pre-commit hooks on staged files: +pre-commit run + +# Run on all files: +pre-commit run --all-files + +# Run a specific hook: +pre-commit run ruff-check --all-files + +# Run mypy as it is in CI: +pre-commit run mypy-3.10 --all-files --hook-stage manual +``` + +### Commit messages + +Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example: + +```text +Your commit message here + +Co-authored-by: GitHub Copilot +Co-authored-by: Claude +Co-authored-by: gemini-code-assist +Signed-off-by: Your Name +``` diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000000000000000000000000000000000..43c994c2d3617f947bcb5adf1933e21dabe46bb5 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/CMakeLists.txt b/CMakeLists.txt index ffeb97aaf7cfa91592246da46089519ff3001593..adcd58960c684d9a1a6bc5acc4b21436771f8d7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151;gfx928;gfx936;gfx938") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1150;gfx1151;gfx1152;gfx1153;gfx1200;gfx1201;gfx928;gfx936;gfx938") # ROCm installation prefix. 
Default to /opt/rocm but allow override via # -DROCM_PATH=/your/rocm/path when invoking cmake. @@ -293,6 +293,7 @@ set(VLLM_EXT_SRC "csrc/fused_qknorm_rope_kernel.cu" # "csrc/layernorm_quant_kernels.cu" "csrc/sampler.cu" + "csrc/topk.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/w8a8/int8/scaled_quant.cu" @@ -724,7 +725,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # CUTLASS MoE kernels # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works - # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled + # on Hopper). get_cutlass_(batched_)moe_mm_data should only be compiled # if it's possible to compile MoE kernels that use its output. cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) @@ -770,6 +771,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + # Expert-specialization MXFP8 blockscaled grouped kernels (SM100+). + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND ES_MXFP8_GROUPED_MM_ARCHS) + set(SRCS + "csrc/moe/mxfp8_moe/cutlass_mxfp8_grouped_mm.cu" + "csrc/moe/mxfp8_moe/mxfp8_experts_quant.cu") + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${ES_MXFP8_GROUPED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_ES_MXFP8_GROUPED_MM_SM100=1") + message(STATUS "Building ES MXFP8 grouped kernels for archs: ${ES_MXFP8_GROUPED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 + AND ES_MXFP8_GROUPED_MM_ARCHS) + message(STATUS "Not building ES MXFP8 grouped kernels as CUDA Compiler version is " + "not >= 12.8.") + else() + message(STATUS "Not 
building ES MXFP8 grouped kernels as no compatible archs found " + "in CUDA target architectures.") + endif() + endif() + + # DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}") + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS) + set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu") + set_gencode_flags_for_srcs( + SRCS "${DSV3_FUSED_A_GEMM_SRC}" + CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}") + list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC}) + message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}") + else() + message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found " + "in CUDA target architectures.") + endif() + # moe_data.cu is used by all CUTLASS MoE kernels. 
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}") @@ -952,7 +998,8 @@ set(VLLM_MOE_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu" - "csrc/moe/grouped_topk_kernels.cu") + "csrc/moe/grouped_topk_kernels.cu" + "csrc/moe/router_gemm.cu") endif() if(VLLM_GPU_LANG STREQUAL "CUDA") @@ -1081,6 +1128,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Not building Marlin MOE kernels as no compatible archs found" " in CUDA target architectures") endif() + + # DeepSeek V3 router GEMM kernel - requires SM90+ + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(DSV3_ROUTER_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}") + endif() + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_ROUTER_GEMM_ARCHS) + set(DSV3_ROUTER_GEMM_SRC + "csrc/moe/dsv3_router_gemm_entry.cu" + "csrc/moe/dsv3_router_gemm_float_out.cu" + "csrc/moe/dsv3_router_gemm_bf16_out.cu") + set_gencode_flags_for_srcs( + SRCS "${DSV3_ROUTER_GEMM_SRC}" + CUDA_ARCHS "${DSV3_ROUTER_GEMM_ARCHS}") + list(APPEND VLLM_MOE_EXT_SRC "${DSV3_ROUTER_GEMM_SRC}") + message(STATUS "Building DSV3 router GEMM kernel for archs: ${DSV3_ROUTER_GEMM_ARCHS}") + else() + message(STATUS "Not building DSV3 router GEMM kernel as no compatible archs found" + " (requires SM90+ and CUDA >= 12.0)") + endif() endif() message(STATUS "Enabling moe extension.") diff --git a/benchmarks/attention_benchmarks/README.md b/benchmarks/attention_benchmarks/README.md index 788ce94f23fb8e275cdc931c451af63b1b52c704..afce344331670910549239b7fefaefdfdd2e174e 100644 --- a/benchmarks/attention_benchmarks/README.md +++ b/benchmarks/attention_benchmarks/README.md @@ -187,7 +187,7 @@ python benchmark.py \ ## Hardware Requirements | Backend | 
Hardware | -|---------|----------| +| ------- | -------- | | Flash/Triton/FlashInfer | Any CUDA GPU | | CUTLASS MLA | Blackwell (SM100+) | | FlashAttn MLA | Hopper (SM90+) | diff --git a/benchmarks/attention_benchmarks/__init__.py b/benchmarks/attention_benchmarks/__init__.py index df7a6328569d7f4c1c1ec6c62e069977b6220983..2d21288700a5997ae8d0c5569f95d43f3c02a3fd 100644 --- a/benchmarks/attention_benchmarks/__init__.py +++ b/benchmarks/attention_benchmarks/__init__.py @@ -15,7 +15,6 @@ from .common import ( BenchmarkConfig, BenchmarkResult, MockLayer, - MockModelConfig, ResultsFormatter, get_attention_scale, is_mla_backend, @@ -36,7 +35,6 @@ __all__ = [ "ResultsFormatter", # Mock objects "MockLayer", - "MockModelConfig", # Utilities "setup_mla_dims", "get_attention_scale", diff --git a/benchmarks/attention_benchmarks/batch_spec.py b/benchmarks/attention_benchmarks/batch_spec.py index 41681796e2e6124d10208b054c43b1f7b5efdc0f..9f15f1d8096e7b582db99f9e5537f7b4ac55c1b5 100644 --- a/benchmarks/attention_benchmarks/batch_spec.py +++ b/benchmarks/attention_benchmarks/batch_spec.py @@ -229,3 +229,40 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict: sum(r.kv_len for r in requests) / len(requests) if requests else 0 ), } + + +def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str: + """ + Classify a batch spec into a type string. 
+ + Args: + batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k") + spec_decode_threshold: Max q_len to be considered spec-decode vs extend + + Returns: + Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)" + """ + requests = parse_batch_spec(batch_spec) + + # Classify each request + types_present = set() + for req in requests: + if req.is_decode: + types_present.add("decode") + elif req.is_prefill: + types_present.add("prefill") + elif req.is_extend: + # Distinguish spec-decode (small q_len) from extend (chunked prefill) + if req.q_len <= spec_decode_threshold: + types_present.add("spec-decode") + else: + types_present.add("extend") + + if len(types_present) == 1: + return types_present.pop() + elif len(types_present) > 1: + # Sort for consistent output + sorted_types = sorted(types_present) + return f"mixed ({'+'.join(sorted_types)})" + else: + return "unknown" diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index ba11fca7452f7d4a8ae63322c986606b90e5117a..0329d110244c66cef1ce15bc162bf7f432be3d54 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -43,6 +43,7 @@ from common import ( ModelParameterSweep, ParameterSweep, ResultsFormatter, + batch_spec_sort_key, is_mla_backend, ) @@ -58,7 +59,9 @@ def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: """Run MLA benchmark with appropriate backend.""" from mla_runner import run_mla_benchmark as run_mla - return run_mla(config.backend, config, **kwargs) + return run_mla( + config.backend, config, prefill_backend=config.prefill_backend, **kwargs + ) def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult: @@ -218,10 +221,13 @@ def run_model_parameter_sweep( by_param_and_spec[key].append(r) break - # Sort by param value then spec + # Sort by param value then spec (batch_size, q_len, kv_len) sorted_keys = sorted( 
by_param_and_spec.keys(), - key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]), + key=lambda x: ( + int(x[0]) if x[0].isdigit() else x[0], + batch_spec_sort_key(x[1]), + ), ) current_param_value = None @@ -330,7 +336,7 @@ def run_parameter_sweep( by_spec[spec] = [] by_spec[spec].append(r) - for spec in sorted(by_spec.keys()): + for spec in sorted(by_spec.keys(), key=batch_spec_sort_key): results = by_spec[spec] best = min(results, key=lambda r: r.mean_time) console.print( @@ -436,14 +442,21 @@ def main(): # Backend selection parser.add_argument( "--backends", + "--decode-backends", nargs="+", - help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, " + help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, " "flashinfer_mla, flashattn_mla, flashmla)", ) parser.add_argument( "--backend", help="Single backend (alternative to --backends)", ) + parser.add_argument( + "--prefill-backends", + nargs="+", + help="Prefill backends to compare (fa2, fa3, fa4). 
" + "Uses the first decode backend for impl construction.", + ) # Batch specifications parser.add_argument( @@ -496,15 +509,24 @@ def main(): if "description" in yaml_config: console.print(f"[dim]{yaml_config['description']}[/]") - # Override args with YAML values - # (YAML takes precedence unless CLI arg was explicitly set) - # Backend(s) - if "backend" in yaml_config: - args.backend = yaml_config["backend"] - args.backends = None - elif "backends" in yaml_config: - args.backends = yaml_config["backends"] - args.backend = None + # Override args with YAML values, but CLI args take precedence + # Check if CLI provided backends (they would be non-None and not default) + cli_backends_provided = args.backend is not None or args.backends is not None + + # Backend(s) - only use YAML if CLI didn't specify + if not cli_backends_provided: + if "backend" in yaml_config: + args.backend = yaml_config["backend"] + args.backends = None + elif "backends" in yaml_config: + args.backends = yaml_config["backends"] + args.backend = None + elif "decode_backends" in yaml_config: + args.backends = yaml_config["decode_backends"] + args.backend = None + + # Prefill backends (e.g., ["fa3", "fa4"]) + args.prefill_backends = yaml_config.get("prefill_backends", None) # Check for special modes if "mode" in yaml_config: @@ -544,13 +566,15 @@ def main(): args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads) args.block_size = model.get("block_size", args.block_size) - # Benchmark settings - if "benchmark" in yaml_config: - bench = yaml_config["benchmark"] - args.device = bench.get("device", args.device) - args.repeats = bench.get("repeats", args.repeats) - args.warmup_iters = bench.get("warmup_iters", args.warmup_iters) - args.profile_memory = bench.get("profile_memory", args.profile_memory) + # Benchmark settings (top-level keys) + if "device" in yaml_config: + args.device = yaml_config["device"] + if "repeats" in yaml_config: + args.repeats = yaml_config["repeats"] + if 
"warmup_iters" in yaml_config: + args.warmup_iters = yaml_config["warmup_iters"] + if "profile_memory" in yaml_config: + args.profile_memory = yaml_config["profile_memory"] # Parameter sweep configuration if "parameter_sweep" in yaml_config: @@ -604,7 +628,10 @@ def main(): # Determine backends backends = args.backends or ([args.backend] if args.backend else ["flash"]) + prefill_backends = getattr(args, "prefill_backends", None) console.print(f"Backends: {', '.join(backends)}") + if prefill_backends: + console.print(f"Prefill backends: {', '.join(prefill_backends)}") console.print(f"Batch specs: {', '.join(args.batch_specs)}") console.print() @@ -841,37 +868,93 @@ def main(): else: # Normal mode: compare backends - total = len(backends) * len(args.batch_specs) + decode_results = [] + prefill_results = [] - with tqdm(total=total, desc="Benchmarking") as pbar: - for spec in args.batch_specs: - for backend in backends: - config = BenchmarkConfig( - backend=backend, - batch_spec=spec, - num_layers=args.num_layers, - head_dim=args.head_dim, - num_q_heads=args.num_q_heads, - num_kv_heads=args.num_kv_heads, - block_size=args.block_size, - device=args.device, - repeats=args.repeats, - warmup_iters=args.warmup_iters, - profile_memory=args.profile_memory, - ) + # Run decode backend comparison + if not prefill_backends: + # No prefill backends specified: compare decode backends as before + total = len(backends) * len(args.batch_specs) - result = run_benchmark(config) - all_results.append(result) + with tqdm(total=total, desc="Benchmarking") as pbar: + for spec in args.batch_specs: + for backend in backends: + config = BenchmarkConfig( + backend=backend, + batch_spec=spec, + num_layers=args.num_layers, + head_dim=args.head_dim, + num_q_heads=args.num_q_heads, + num_kv_heads=args.num_kv_heads, + block_size=args.block_size, + device=args.device, + repeats=args.repeats, + warmup_iters=args.warmup_iters, + profile_memory=args.profile_memory, + ) - if not result.success: - 
console.print(f"[red]Error {backend} {spec}: {result.error}[/]") + result = run_benchmark(config) + decode_results.append(result) - pbar.update(1) + if not result.success: + console.print( + f"[red]Error {backend} {spec}: {result.error}[/]" + ) - # Display results - console.print("\n[bold green]Results:[/]") - formatter = ResultsFormatter(console) - formatter.print_table(all_results, backends) + pbar.update(1) + + console.print("\n[bold green]Results:[/]") + formatter = ResultsFormatter(console) + formatter.print_table(decode_results, backends) + + # Run prefill backend comparison + if prefill_backends: + # Use first decode backend for impl construction + decode_backend = backends[0] + total = len(prefill_backends) * len(args.batch_specs) + + console.print( + f"[yellow]Prefill comparison mode: " + f"using {decode_backend} for decode impl[/]" + ) + + with tqdm(total=total, desc="Prefill benchmarking") as pbar: + for spec in args.batch_specs: + for pb in prefill_backends: + config = BenchmarkConfig( + backend=decode_backend, + batch_spec=spec, + num_layers=args.num_layers, + head_dim=args.head_dim, + num_q_heads=args.num_q_heads, + num_kv_heads=args.num_kv_heads, + block_size=args.block_size, + device=args.device, + repeats=args.repeats, + warmup_iters=args.warmup_iters, + profile_memory=args.profile_memory, + prefill_backend=pb, + ) + + result = run_benchmark(config) + + # Label result with prefill backend name for display + labeled_config = replace(result.config, backend=pb) + result = replace(result, config=labeled_config) + prefill_results.append(result) + + if not result.success: + console.print(f"[red]Error {pb} {spec}: {result.error}[/]") + + pbar.update(1) + + console.print("\n[bold green]Prefill Backend Results:[/]") + formatter = ResultsFormatter(console) + formatter.print_table( + prefill_results, prefill_backends, compare_to_fastest=True + ) + + all_results = decode_results + prefill_results # Save results if all_results: diff --git 
a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index 7155bdc3fc5bf24c79fef51f5be42d497fa48a4b..208d6273c928338e47362b74eacb0ccf01ce1bfb 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -10,18 +10,37 @@ from dataclasses import asdict, dataclass from pathlib import Path from typing import Any -import numpy as np import torch +from batch_spec import get_batch_type, parse_batch_spec from rich.console import Console from rich.table import Table + +def batch_spec_sort_key(spec: str) -> tuple[int, int, int]: + """ + Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len). + + This ensures results are sorted by batch size first, then query length, + then sequence length, rather than alphabetically. + """ + try: + requests = parse_batch_spec(spec) + batch_size = len(requests) + max_q_len = max(r.q_len for r in requests) if requests else 0 + max_kv_len = max(r.kv_len for r in requests) if requests else 0 + return (batch_size, max_q_len, max_kv_len) + except Exception: + # Fallback for unparsable specs + return (0, 0, 0) + + # Mock classes for vLLM attention infrastructure class MockHfConfig: """Mock HuggingFace config that satisfies vLLM's requirements.""" - def __init__(self, mla_dims: dict): + def __init__(self, mla_dims: dict, index_topk: int | None = None): self.num_attention_heads = mla_dims["num_q_heads"] self.num_key_value_heads = mla_dims["num_kv_heads"] self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"] @@ -32,6 +51,8 @@ class MockHfConfig: self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"] self.v_head_dim = mla_dims["v_head_dim"] self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] + if index_topk is not None: + self.index_topk = index_topk def get_text_config(self): return self @@ -40,10 +61,7 @@ class MockHfConfig: # Import AttentionLayerBase at module level to avoid circular dependencies try: from 
vllm.model_executor.layers.attention_layer_base import AttentionLayerBase - - _HAS_ATTENTION_LAYER_BASE = True except ImportError: - _HAS_ATTENTION_LAYER_BASE = False AttentionLayerBase = object # Fallback @@ -59,6 +77,7 @@ class MockKVBProj: self.qk_nope_head_dim = qk_nope_head_dim self.v_head_dim = v_head_dim self.out_dim = qk_nope_head_dim + v_head_dim + self.weight = torch.empty(0, dtype=torch.bfloat16) def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]: """ @@ -82,6 +101,38 @@ class MockKVBProj: return (result,) # Return as tuple to match ColumnParallelLinear API +class MockIndexer: + """Mock Indexer for sparse MLA backends. + + Provides topk_indices_buffer that sparse MLA backends use to determine + which KV cache slots to attend to for each token. + """ + + def __init__( + self, + max_num_tokens: int, + topk_tokens: int, + device: torch.device, + ): + self.topk_tokens = topk_tokens + self.topk_indices_buffer = torch.zeros( + (max_num_tokens, topk_tokens), + dtype=torch.int32, + device=device, + ) + + def fill_random_indices(self, num_tokens: int, max_kv_len: int): + """Fill topk_indices_buffer with random valid indices for benchmarking.""" + indices = torch.randint( + 0, + max_kv_len, + (num_tokens, self.topk_tokens), + dtype=torch.int32, + device=self.topk_indices_buffer.device, + ) + self.topk_indices_buffer[:num_tokens] = indices + + class MockLayer(AttentionLayerBase): """Mock attention layer with scale parameters and impl. 
@@ -113,95 +164,6 @@ class MockLayer(AttentionLayerBase): return self._kv_cache_spec -class MockModelConfig: - """Mock model configuration.""" - - def __init__( - self, - num_q_heads: int, - num_kv_heads: int, - head_dim: int, - dtype: torch.dtype = torch.float16, - max_model_len: int = 32768, - ): - self._n_q = num_q_heads - self._n_kv = num_kv_heads - self._d = head_dim - self.dtype = dtype - self.max_model_len = max_model_len - - def get_num_attention_heads(self, _=None) -> int: - return self._n_q - - def get_num_kv_heads(self, _=None) -> int: - return self._n_kv - - def get_head_size(self) -> int: - return self._d - - def get_num_layers(self) -> int: - """Mock method for layer count queries.""" - return 1 - - def get_sliding_window_for_layer(self, _layer_idx: int): - """Mock method for sliding window queries.""" - return None - - def get_logits_soft_cap_for_layer(self, _layer_idx: int): - """Mock method for logits soft cap queries.""" - return None - - def get_sm_scale_for_layer(self, _layer_idx: int) -> float: - """Mock method for SM scale queries.""" - return 1.0 / (self.get_head_size() ** 0.5) - - -class MockParallelConfig: - """Mock parallel configuration.""" - - pass - - -class MockCompilationConfig: - """Mock compilation configuration.""" - - def __init__(self): - self.full_cuda_graph = False - self.static_forward_context = {} - - -class MockVLLMConfig: - """Mock VLLM configuration.""" - - def __init__(self): - self.compilation_config = MockCompilationConfig() - - -class MockRunner: - """Mock GPU runner for metadata builders.""" - - def __init__( - self, - seq_lens: np.ndarray, - query_start_locs: np.ndarray, - device: torch.device, - num_q_heads: int, - num_kv_heads: int, - head_dim: int, - dtype: torch.dtype, - ): - self.model_config = MockModelConfig(num_q_heads, num_kv_heads, head_dim, dtype) - self.parallel_config = MockParallelConfig() - self.vllm_config = MockVLLMConfig() - self.seq_lens_np = seq_lens - self.query_start_loc_np = query_start_locs - 
self.device = device - self.attention_chunk_size = None - self.num_query_heads = num_q_heads - self.num_kv_heads = num_kv_heads - self.dtype = dtype - - @dataclass class ParameterSweep: """Configuration for sweeping a backend parameter.""" @@ -252,6 +214,7 @@ class BenchmarkConfig: use_cuda_graphs: bool = False # MLA-specific + prefill_backend: str | None = None kv_lora_rank: int | None = None qk_nope_head_dim: int | None = None qk_rope_head_dim: int | None = None @@ -316,14 +279,19 @@ class ResultsFormatter: backends: List of backend names being compared compare_to_fastest: Show percentage comparison to fastest """ - # Group by batch spec + # Group by batch spec, preserving first-occurrence order by_spec = {} + specs_order = [] for r in results: spec = r.config.batch_spec if spec not in by_spec: by_spec[spec] = {} + specs_order.append(spec) by_spec[spec][r.config.backend] = r + # Sort specs by (batch_size, q_len, kv_len) instead of alphabetically + specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key) + # Create shortened backend names for display def shorten_backend_name(name: str) -> str: """Shorten long backend names for table display.""" @@ -337,6 +305,8 @@ class ResultsFormatter: table = Table(title="Attention Benchmark Results") table.add_column("Batch\nSpec", no_wrap=True) + table.add_column("Type", no_wrap=True) + table.add_column("Batch\nSize", justify="right", no_wrap=True) multi = len(backends) > 1 for backend in backends: @@ -350,12 +320,14 @@ class ResultsFormatter: table.add_column(col_rel, justify="right", no_wrap=False) # Add rows - for spec in sorted(by_spec.keys()): + for spec in specs_order: spec_results = by_spec[spec] times = {b: r.mean_time for b, r in spec_results.items() if r.success} best_time = min(times.values()) if times else 0.0 - row = [spec] + batch_type = get_batch_type(spec) + batch_size = len(parse_batch_spec(spec)) + row = [spec, batch_type, str(batch_size)] for backend in backends: if backend in spec_results: r = 
spec_results[backend] @@ -486,10 +458,11 @@ def get_attention_scale(head_dim: int) -> float: def is_mla_backend(backend: str) -> bool: """ - Check if backend is an MLA backend using the backend's is_mla() property. + Check if backend is an MLA backend using the AttentionBackendEnum. Args: - backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA") + backend: Backend name matching AttentionBackendEnum exactly + (e.g., "FLASHMLA_SPARSE") Returns: True if the backend is an MLA backend, False otherwise @@ -497,7 +470,8 @@ def is_mla_backend(backend: str) -> bool: from vllm.v1.attention.backends.registry import AttentionBackendEnum try: - backend_class = AttentionBackendEnum[backend.upper()].get_class() + backend_enum = AttentionBackendEnum[backend] + backend_class = backend_enum.get_class() return backend_class.is_mla() - except (KeyError, ValueError, ImportError): + except (KeyError, ValueError, ImportError, AttributeError): return False diff --git a/benchmarks/attention_benchmarks/configs/mla_decode.yaml b/benchmarks/attention_benchmarks/configs/mla_decode.yaml index aaf4eec9b1c852ad101442ff6d518ff7f29078e5..d758654dbe802e391f5c84f9b067fab40f035564 100644 --- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml @@ -3,7 +3,7 @@ model: name: "deepseek-v3" num_layers: 60 - num_q_heads: 128 + num_q_heads: 128 # Base value, can be swept for TP simulation num_kv_heads: 1 # MLA uses single latent KV head_dim: 576 kv_lora_rank: 512 @@ -12,6 +12,13 @@ model: v_head_dim: 128 block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128 +# Model parameter sweep: simulate tensor parallelism by varying num_q_heads +# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads +model_parameter_sweep: + param_name: "num_q_heads" + values: [128, 64, 32, 16] + label_format: "{backend}_{value}h" + batch_specs: # Small batches, varying sequence lengths - "16q1s512" # 16 requests, 512 KV cache @@ -34,28 +41,30 @@ 
batch_specs: # Very large batches - "128q1s1k" # 128 requests, 1k KV cache - "128q1s2k" # 128 requests, 2k KV cache + - "128q1s4k" # 128 requests, 4k KV cache + - "128q1s8k" # 128 requests, 8k KV cache # Long context - "32q1s16k" # 32 requests, 16k KV cache - "32q1s32k" # 32 requests, 32k KV cache backends: - - cutlass_mla - - flashinfer_mla - - flashattn_mla # Hopper only - - flashmla # Hopper only + - CUTLASS_MLA + - FLASHINFER_MLA + - FLASH_ATTN_MLA # Hopper only + - FLASHMLA # Hopper only device: "cuda:0" -repeats: 5 -warmup_iters: 3 +repeats: 100 +warmup_iters: 10 profile_memory: true # Backend-specific tuning -cutlass_mla: +CUTLASS_MLA: num_kv_splits: auto # or specific value like 4, 8, 16 -flashattn_mla: +FLASH_ATTN_MLA: reorder_batch_threshold: 512 -flashmla: +FLASHMLA: reorder_batch_threshold: 1 diff --git a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml index ad3c0dced6ec696243a04b25e0546a7f7e13718c..b555d90cbf6296f376118f4c7499b01925d2c2bf 100644 --- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml @@ -45,10 +45,10 @@ batch_specs: - "4q4k_60q1s4k" # 4 prefill + 60 decode backends: - - cutlass_mla - - flashinfer_mla - - flashattn_mla # Hopper only - - flashmla # Hopper only + - CUTLASS_MLA + - FLASHINFER_MLA + - FLASH_ATTN_MLA # Hopper only + - FLASHMLA # Hopper only device: "cuda:0" repeats: 5 diff --git a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml new file mode 100644 index 0000000000000000000000000000000000000000..122dbd783c5b26154ed60e5958f26a32f6db506e --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml @@ -0,0 +1,126 @@ +# MLA prefill backend comparison +# +# Compares all available MLA prefill backends: +# FA backends: fa2, fa3, fa4 (FlashAttention versions) +# Non-FA: flashinfer, cudnn, trtllm 
(Blackwell-only, require flashinfer) +# +# Uses cutlass_mla as the decode backend for impl construction +# (only the prefill path is exercised). +# +# Backends that aren't available on the current platform will report errors +# in the results table (e.g., fa3 on Blackwell, cudnn without artifactory). +# +# Usage: +# python benchmark.py --config configs/mla_prefill.yaml + +description: "MLA prefill backend comparison" + +model: + name: "deepseek-v3" + num_layers: 60 + num_q_heads: 128 + num_kv_heads: 1 + head_dim: 576 + kv_lora_rank: 512 + qk_nope_head_dim: 128 + qk_rope_head_dim: 64 + v_head_dim: 128 + block_size: 128 + +# model: +# name: "deepseek-v2-lite" +# num_layers: 27 +# num_q_heads: 16 +# num_kv_heads: 1 +# head_dim: 576 +# kv_lora_rank: 512 +# qk_nope_head_dim: 128 +# qk_rope_head_dim: 64 +# v_head_dim: 128 +# block_size: 128 + +batch_specs: + # Pure prefill + - "q512" + - "q1k" + - "q2k" + - "q4k" + - "q8k" + + # Batched pure prefill + - "2q512" + - "2q1k" + - "2q2k" + - "2q4k" + - "2q8k" + - "4q512" + - "4q1k" + - "4q2k" + - "4q4k" + - "4q8k" + - "8q512" + - "8q1k" + - "8q2k" + - "8q4k" + - "8q8k" + + # Chunked prefill / extend + # Short context + - "q128s1k" + - "q256s2k" + - "q512s4k" + - "q1ks4k" + - "q2ks8k" + - "2q128s1k" + - "2q256s2k" + - "2q512s4k" + - "2q1ks4k" + - "2q2ks8k" + - "4q128s1k" + - "4q256s2k" + - "4q512s4k" + - "4q1ks4k" + - "4q2ks8k" + - "8q128s1k" + - "8q256s2k" + - "8q512s4k" + - "8q1ks4k" + + # Medium context + - "q128s16k" + - "q512s16k" + - "q1ks16k" + - "q2ks16k" + - "2q128s16k" + - "2q512s16k" + - "2q1ks16k" + - "2q2ks16k" + - "4q128s16k" + - "4q512s16k" + - "4q1ks16k" + - "4q2ks16k" + + # Long context + - "q128s64k" + - "q512s64k" + - "q1ks64k" + - "q2ks64k" + - "2q128s64k" + - "2q512s64k" + - "2q1ks64k" + - "2q2ks64k" + +decode_backends: + - CUTLASS_MLA + +prefill_backends: + - fa2 + - fa3 + - fa4 + - flashinfer + - cudnn + - trtllm + +device: "cuda:0" +repeats: 20 +warmup_iters: 5 diff --git 
a/benchmarks/attention_benchmarks/configs/mla_sparse_prefill.yaml b/benchmarks/attention_benchmarks/configs/mla_sparse_prefill.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef6b2cb07dc70192ff428adaa0b18e32f0941e7e --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/mla_sparse_prefill.yaml @@ -0,0 +1,62 @@ +# MLA prefill-only benchmark configuration for sparse backends + +model: + name: "deepseek-v3" + num_layers: 60 + num_q_heads: 128 + num_kv_heads: 1 + head_dim: 576 + kv_lora_rank: 512 + qk_nope_head_dim: 128 + qk_rope_head_dim: 64 + v_head_dim: 128 + block_size: 128 + +# Model parameter sweep: simulate tensor parallelism by varying num_q_heads +# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads +model_parameter_sweep: + param_name: "num_q_heads" + values: [128, 64, 32, 16] + label_format: "{backend}_{value}h" + +batch_specs: + # Pure prefill + - "1q512" + - "1q1k" + - "1q2k" + - "1q4k" + - "1q8k" + + # Batched pure prefill + - "2q512" + - "2q1k" + - "2q2k" + - "2q4k" + - "2q8k" + - "4q512" + - "4q1k" + - "4q2k" + - "4q4k" + - "4q8k" + - "8q512" + - "8q1k" + - "8q2k" + - "8q4k" + - "8q8k" + + # Extend + - "1q512s4k" + - "1q512s8k" + - "1q1ks8k" + - "1q2ks8k" + - "1q2ks16k" + - "1q4ks16k" + +backends: + - FLASHMLA_SPARSE + - FLASHINFER_MLA_SPARSE + +device: "cuda:0" +repeats: 10 +warmup_iters: 3 +profile_memory: true diff --git a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml index 1ea0a12b53381a0c3958be20a0456aba2999abbc..0d76ef0a358ca7584676cd3cfedf8982cd0b7b46 100644 --- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml +++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml @@ -6,7 +6,7 @@ description: "Decode vs Prefill pipeline crossover analysis" # Test FlashAttn MLA -backend: flashattn_mla +backend: FLASH_ATTN_MLA # Mode: decode_vs_prefill comparison (special sweep mode) # For each batch spec, we'll 
test both decode and prefill pipelines @@ -62,11 +62,10 @@ model: block_size: 128 # Benchmark settings -benchmark: - device: "cuda:0" - repeats: 15 # More repeats for spec decode variance - warmup_iters: 5 - profile_memory: false +device: "cuda:0" +repeats: 15 # More repeats for spec decode variance +warmup_iters: 5 +profile_memory: false # Output output: diff --git a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml index 56d2428fe74fd4022208c70bcb9b4cfd04638252..47b6d3604d1d256dcbfd9181cb6a8a2817f8dded 100644 --- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml @@ -41,18 +41,17 @@ batch_specs: # Backends that support query length > 1 backends: - - flashattn_mla # reorder_batch_threshold = 512 - - flashmla # reorder_batch_threshold = 1 (tunable) + - FLASH_ATTN_MLA # reorder_batch_threshold = 512 + - FLASHMLA # reorder_batch_threshold = 1 (tunable) # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism -# - flashinfer_mla +# - FLASHINFER_MLA # Benchmark settings -benchmark: - device: "cuda:0" - repeats: 10 # More repeats for statistical significance - warmup_iters: 5 - profile_memory: false +device: "cuda:0" +repeats: 10 # More repeats for statistical significance +warmup_iters: 5 +profile_memory: false # Test these threshold values for optimization parameter_sweep: diff --git a/benchmarks/attention_benchmarks/configs/standard_attention.yaml b/benchmarks/attention_benchmarks/configs/standard_attention.yaml index c0bdb98fbf62c489768bd094a3cee213ecbfcb12..deb5a4b27ff3fc4362de880b65372e3814abbf5d 100644 --- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml +++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml @@ -25,14 +25,22 @@ batch_specs: - "4q1k_16q1s2k" # 4 prefill + 16 decode - "2q4k_32q1s1k" # 2 large prefill + 32 decode - # Context 
extension - - "q1ks2k" # 1k query, 2k sequence (chunked prefill) + # Speculative decode (q <= 8) + - "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache + - "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache + - "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache + - "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache + - "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache + + # Context extension (chunked prefill) + - "q1ks2k" # 1k query, 2k sequence - "2q1ks4k" # 2 requests: 1k query, 4k sequence +# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER backends: - - flash - - triton - - flashinfer + - FLASH_ATTN + - TRITON_ATTN + - FLASHINFER device: "cuda:0" repeats: 5 diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 2c6c3aaac3605380bef964496c42ad63b2925c4b..0d612e374a12a640698ff35ca406c85941f1633a 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -8,14 +8,13 @@ This module provides helpers for running MLA backends without needing full VllmConfig integration. """ -import importlib - import numpy as np import torch from batch_spec import parse_batch_spec from common import ( BenchmarkResult, MockHfConfig, + MockIndexer, MockKVBProj, MockLayer, setup_mla_dims, @@ -62,6 +61,8 @@ def create_minimal_vllm_config( block_size: int = 128, max_num_seqs: int = 256, mla_dims: dict | None = None, + index_topk: int | None = None, + prefill_backend: str | None = None, ) -> VllmConfig: """ Create minimal VllmConfig for MLA benchmarks. @@ -73,6 +74,11 @@ def create_minimal_vllm_config( max_num_seqs: Maximum number of sequences mla_dims: Optional custom MLA dimensions dict. If not provided, uses setup_mla_dims(model_name) + index_topk: Optional topk value for sparse MLA backends. If provided, + the config will include index_topk for sparse attention. + prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer", + "cudnn", "trtllm"). 
Configures the attention config to + force the specified prefill backend. Returns: VllmConfig for benchmarking @@ -82,7 +88,7 @@ def create_minimal_vllm_config( mla_dims = setup_mla_dims(model_name) # Create mock HF config first (avoids downloading from HuggingFace) - mock_hf_config = MockHfConfig(mla_dims) + mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk) # Create a temporary minimal config.json to avoid HF downloads # This ensures consistent ModelConfig construction without network access @@ -120,16 +126,12 @@ def create_minimal_vllm_config( seed=0, max_model_len=32768, quantization=None, - quantization_param_path=None, enforce_eager=False, - max_context_len_to_capture=None, - max_seq_len_to_capture=8192, max_logprobs=20, disable_sliding_window=False, skip_tokenizer_init=True, served_model_name=None, limit_mm_per_prompt=None, - use_async_output_proc=True, config_format="auto", ) finally: @@ -147,7 +149,6 @@ def create_minimal_vllm_config( cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, - swap_space=0, cache_dtype="auto", enable_prefix_caching=False, ) @@ -166,7 +167,7 @@ def create_minimal_vllm_config( compilation_config = CompilationConfig() - return VllmConfig( + vllm_config = VllmConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, @@ -174,62 +175,147 @@ def create_minimal_vllm_config( compilation_config=compilation_config, ) + if prefill_backend is not None: + prefill_cfg = get_prefill_backend_config(prefill_backend) + if prefill_cfg["flash_attn_version"] is not None: + vllm_config.attention_config.flash_attn_version = prefill_cfg[ + "flash_attn_version" + ] + vllm_config.attention_config.disable_flashinfer_prefill = prefill_cfg[ + "disable_flashinfer_prefill" + ] + vllm_config.attention_config.use_cudnn_prefill = prefill_cfg[ + "use_cudnn_prefill" + ] + vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[ + "use_trtllm_ragged_deepseek_prefill" + 
] + + return vllm_config + # ============================================================================ -# Backend Configuration +# Prefill Backend Configuration # ============================================================================ - -# Backend name to class name prefix mapping -_BACKEND_NAME_MAP = { - "flashattn_mla": "FlashAttnMLA", - "flashmla": "FlashMLA", - "flashinfer_mla": "FlashInferMLA", - "cutlass_mla": "CutlassMLA", +# Maps prefill backend names to attention config overrides. +# FA backends set flash_attn_version and disable non-FA paths. +# Non-FA backends enable their specific path and disable others. +_PREFILL_BACKEND_CONFIG: dict[str, dict] = { + "fa2": { + "flash_attn_version": 2, + "disable_flashinfer_prefill": True, + "use_cudnn_prefill": False, + "use_trtllm_ragged_deepseek_prefill": False, + }, + "fa3": { + "flash_attn_version": 3, + "disable_flashinfer_prefill": True, + "use_cudnn_prefill": False, + "use_trtllm_ragged_deepseek_prefill": False, + }, + "fa4": { + "flash_attn_version": 4, + "disable_flashinfer_prefill": True, + "use_cudnn_prefill": False, + "use_trtllm_ragged_deepseek_prefill": False, + }, + "flashinfer": { + "flash_attn_version": None, + "disable_flashinfer_prefill": False, + "use_cudnn_prefill": False, + "use_trtllm_ragged_deepseek_prefill": False, + }, + "cudnn": { + "flash_attn_version": None, + "disable_flashinfer_prefill": True, + "use_cudnn_prefill": True, + "use_trtllm_ragged_deepseek_prefill": False, + }, + "trtllm": { + "flash_attn_version": None, + "disable_flashinfer_prefill": True, + "use_cudnn_prefill": False, + "use_trtllm_ragged_deepseek_prefill": True, + }, } -# Special properties that differ from defaults + +def get_prefill_backend_config(prefill_backend: str) -> dict: + """Get attention config overrides for a prefill backend.""" + if prefill_backend not in _PREFILL_BACKEND_CONFIG: + raise ValueError( + f"Unknown prefill backend: {prefill_backend!r}. 
" + f"Available: {list(_PREFILL_BACKEND_CONFIG.keys())}" + ) + return _PREFILL_BACKEND_CONFIG[prefill_backend] + + +# ============================================================================ +# Decode Backend Configuration +# ============================================================================ + + +# Backend-specific properties that can't be inferred from the backend class +# Keys are AttentionBackendEnum names (uppercase) _BACKEND_PROPERTIES = { - "flashmla": { + "FLASHMLA": { "query_format": "concat", # Single concatenated tensor (vs tuple) - "block_size": 64, # FlashMLA uses fixed block size }, - "flashinfer_mla": { - "block_size": 64, # FlashInfer MLA only supports 32 or 64 + "FLASHMLA_SPARSE": { + "query_format": "concat", # Single concatenated tensor (vs tuple) }, } def _get_backend_config(backend: str) -> dict: """ - Get backend configuration using naming conventions. - - All MLA backends follow the pattern: - - Module: vllm.v1.attention.backends.mla.{backend} - - Impl: {Name}Impl - - Metadata: {Name}Metadata (or MLACommonMetadata) - - DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata) - - MetadataBuilder: {Name}MetadataBuilder + Get backend configuration from AttentionBackendEnum. + + Uses the registry to get the backend class and extract configuration + from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.). 
+ + Args: + backend: Backend name matching AttentionBackendEnum exactly + (e.g., "FLASHMLA_SPARSE") + + Returns: + Dict with backend configuration """ - if backend not in _BACKEND_NAME_MAP: - raise ValueError(f"Unknown backend: {backend}") + from vllm.v1.attention.backend import MultipleOf + from vllm.v1.attention.backends.registry import AttentionBackendEnum - name = _BACKEND_NAME_MAP[backend] + try: + backend_enum = AttentionBackendEnum[backend] + backend_class = backend_enum.get_class() + except (KeyError, ValueError) as e: + valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"] + raise ValueError( + f"Unknown backend: {backend}. " + f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}" + ) from e + + # Get block size from backend class + block_sizes = backend_class.get_supported_kernel_block_sizes() + # Use first supported block size (backends typically support one for MLA) + block_size = block_sizes[0] if block_sizes else None + if isinstance(block_size, MultipleOf): + # No fixed block size; fall back to config value + block_size = None + + # Check if sparse via class method if available + is_sparse = getattr(backend_class, "is_sparse", lambda: False)() + + # Get properties that can't be inferred props = _BACKEND_PROPERTIES.get(backend, {}) - # Check if backend uses common metadata (FlashInfer, CUTLASS) - uses_common = backend in ("flashinfer_mla", "cutlass_mla") - return { - "module": f"vllm.v1.attention.backends.mla.{backend}", - "impl_class": f"{name}Impl", - "metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata", - "decode_metadata_class": "MLACommonDecodeMetadata" - if uses_common - else f"{name}DecodeMetadata", - "builder_class": f"{name}MetadataBuilder", + "backend_class": backend_class, + "impl_class": backend_class.get_impl_cls(), + "builder_class": backend_class.get_builder_cls(), "query_format": props.get("query_format", "tuple"), - "block_size": props.get("block_size", None), + "block_size": 
block_size, + "is_sparse": is_sparse, } @@ -447,22 +533,26 @@ def _create_backend_impl( mla_dims: dict, vllm_config: VllmConfig, device: torch.device, + max_num_tokens: int = 8192, + index_topk: int | None = None, ): """ Create backend implementation instance. Args: - backend_cfg: Backend configuration dict + backend_cfg: Backend configuration dict from _get_backend_config() mla_dims: MLA dimension configuration vllm_config: VllmConfig instance device: Target device + max_num_tokens: Maximum number of tokens for sparse indexer buffer + index_topk: Topk value for sparse MLA backends Returns: - Tuple of (impl, layer, builder_instance) + Tuple of (impl, layer, builder_instance, indexer) """ - # Import backend classes - backend_module = importlib.import_module(backend_cfg["module"]) - impl_class = getattr(backend_module, backend_cfg["impl_class"]) + # Get classes from backend config (already resolved by _get_backend_config) + impl_class = backend_cfg["impl_class"] + builder_class = backend_cfg["builder_class"] # Calculate scale scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) @@ -474,26 +564,44 @@ def _create_backend_impl( v_head_dim=mla_dims["v_head_dim"], ) + # Create indexer for sparse backends + indexer = None + if backend_cfg.get("is_sparse", False): + if index_topk is None: + index_topk = 2048 # Default topk for sparse MLA + indexer = MockIndexer( + max_num_tokens=max_num_tokens, + topk_tokens=index_topk, + device=device, + ) + + # Build impl kwargs + impl_kwargs = { + "num_heads": mla_dims["num_q_heads"], + "head_size": mla_dims["head_dim"], + "scale": scale, + "num_kv_heads": mla_dims["num_kv_heads"], + "alibi_slopes": None, + "sliding_window": None, + "kv_cache_dtype": "auto", + "logits_soft_cap": None, + "attn_type": "decoder", + "kv_sharing_target_layer_name": None, + "q_lora_rank": None, + "kv_lora_rank": mla_dims["kv_lora_rank"], + "qk_nope_head_dim": mla_dims["qk_nope_head_dim"], + "qk_rope_head_dim": 
mla_dims["qk_rope_head_dim"], + "qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + "v_head_dim": mla_dims["v_head_dim"], + "kv_b_proj": mock_kv_b_proj, + } + + # Add indexer for sparse backends + if indexer is not None: + impl_kwargs["indexer"] = indexer + # Create impl - impl = impl_class( - num_heads=mla_dims["num_q_heads"], - head_size=mla_dims["head_dim"], - scale=scale, - num_kv_heads=mla_dims["num_kv_heads"], - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=mla_dims["kv_lora_rank"], - qk_nope_head_dim=mla_dims["qk_nope_head_dim"], - qk_rope_head_dim=mla_dims["qk_rope_head_dim"], - qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], - v_head_dim=mla_dims["v_head_dim"], - kv_b_proj=mock_kv_b_proj, - ) + impl = impl_class(**impl_kwargs) # Initialize DCP attributes if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1): @@ -515,9 +623,7 @@ def _create_backend_impl( # Create builder instance if needed builder_instance = None - if backend_cfg["builder_class"]: - builder_class = getattr(backend_module, backend_cfg["builder_class"]) - + if builder_class: # Populate static_forward_context so builder can find the layer # MockLayer inherits from AttentionLayerBase, so isinstance checks pass vllm_config.compilation_config.static_forward_context = {"placeholder": layer} @@ -529,7 +635,7 @@ def _create_backend_impl( device=device, ) - return impl, layer, builder_instance + return impl, layer, builder_instance, indexer # ============================================================================ @@ -594,6 +700,7 @@ def _run_single_benchmark( backend_cfg: dict, mla_dims: dict, device: torch.device, + indexer=None, ) -> BenchmarkResult: """ Run a single benchmark iteration. 
@@ -606,6 +713,7 @@ def _run_single_benchmark( backend_cfg: Backend configuration dict mla_dims: MLA dimension configuration device: Target device + indexer: Optional MockIndexer for sparse backends Returns: BenchmarkResult with timing statistics @@ -613,7 +721,9 @@ def _run_single_benchmark( # Parse batch spec requests = parse_batch_spec(config.batch_spec) q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] total_q = sum(q_lens) + max_kv_len = max(kv_lens) # Determine block size block_size = backend_cfg["block_size"] or config.block_size @@ -641,13 +751,16 @@ def _run_single_benchmark( torch.bfloat16, ) + # Fill indexer with random indices for sparse backends + is_sparse = backend_cfg.get("is_sparse", False) + if is_sparse and indexer is not None: + indexer.fill_random_indices(total_q, max_kv_len) + # Determine which forward method to use based on metadata if metadata.decode is not None: - forward_fn = lambda: impl._forward_decode( - decode_inputs, kv_cache, metadata, layer - ) + forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer) elif metadata.prefill is not None: - forward_fn = lambda: impl._forward_prefill( + forward_fn = lambda: impl.forward_mha( prefill_inputs["q"], prefill_inputs["k_c_normed"], prefill_inputs["k_pe"], @@ -662,7 +775,7 @@ def _run_single_benchmark( # Warmup for _ in range(config.warmup_iters): forward_fn() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Benchmark times = [] @@ -675,7 +788,7 @@ def _run_single_benchmark( forward_fn() end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() elapsed_ms = start.elapsed_time(end) times.append(elapsed_ms / 1000.0 / config.num_layers) @@ -693,20 +806,26 @@ def _run_single_benchmark( def _run_mla_benchmark_batched( backend: str, configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] 
+ index_topk: int = 2048, + prefill_backend: str | None = None, ) -> list[BenchmarkResult]: """ Unified batched MLA benchmark runner for all backends. - Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla + Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, + flashinfer_mla_sparse, flashmla_sparse This function reuses backend initialization across multiple benchmarks to avoid setup/teardown overhead. Args: - backend: Backend name + backend: Backend name (decode backend used for impl construction) configs_with_params: List of (config, threshold, num_splits) tuples - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only) - num_splits: num_kv_splits (CUTLASS only) + index_topk: Topk value for sparse MLA backends (default 2048) + prefill_backend: Prefill backend name (e.g., "fa3", "fa4"). + When set, forces the specified FlashAttention version for prefill. Returns: List of BenchmarkResult objects @@ -716,7 +835,7 @@ def _run_mla_benchmark_batched( backend_cfg = _get_backend_config(backend) device = torch.device(configs_with_params[0][0].device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Determine block size config_block_size = configs_with_params[0][0].block_size @@ -730,21 +849,75 @@ def _run_mla_benchmark_batched( if mla_dims is None: mla_dims = setup_mla_dims("deepseek-v3") + # Determine if this is a sparse backend + is_sparse = backend_cfg.get("is_sparse", False) + # Create and set vLLM config for MLA (reused across all benchmarks) vllm_config = create_minimal_vllm_config( model_name="deepseek-v3", # Used only for model path block_size=block_size, mla_dims=mla_dims, # Use custom dims from config or default + index_topk=index_topk if is_sparse else None, + prefill_backend=prefill_backend, ) results = [] with set_current_vllm_config(vllm_config): - # Create backend impl, layer, and builder (reused across benchmarks) - impl, layer, builder_instance = _create_backend_impl( - backend_cfg, mla_dims, 
vllm_config, device + # Clear cached prefill backend detection functions so they re-evaluate + # with the current VllmConfig. These are @functools.cache decorated and + # would otherwise return stale results from a previous backend's config. + from vllm.model_executor.layers.attention.mla_attention import ( + use_cudnn_prefill, + use_flashinfer_prefill, + use_trtllm_ragged_deepseek_prefill, + ) + + use_flashinfer_prefill.cache_clear() + use_cudnn_prefill.cache_clear() + use_trtllm_ragged_deepseek_prefill.cache_clear() + + # Create backend impl, layer, builder, and indexer (reused across benchmarks) + impl, layer, builder_instance, indexer = _create_backend_impl( + backend_cfg, + mla_dims, + vllm_config, + device, + index_topk=index_topk if is_sparse else None, ) + # Verify the actual prefill backend matches what was requested + if prefill_backend is not None: + prefill_cfg = get_prefill_backend_config(prefill_backend) + fa_version = prefill_cfg["flash_attn_version"] + + if fa_version is not None: + # FA backend: verify the impl's FA version + actual_fa_version = getattr(impl, "vllm_flash_attn_version", None) + if actual_fa_version != fa_version: + raise RuntimeError( + f"Prefill backend '{prefill_backend}' requested FA " + f"version {fa_version}, but the impl is using FA " + f"version {actual_fa_version}. Check " + f"vllm/v1/attention/backends/fa_utils.py." + ) + else: + # Non-FA backend: verify the builder picked the right path + expected_flags = { + "flashinfer": "_use_fi_prefill", + "cudnn": "_use_cudnn_prefill", + "trtllm": "_use_trtllm_ragged_prefill", + } + flag_name = expected_flags.get(prefill_backend) + if flag_name and not getattr(builder_instance, flag_name, False): + raise RuntimeError( + f"Prefill backend '{prefill_backend}' was requested " + f"but the metadata builder did not enable it. This " + f"usually means a dependency is missing (e.g., " + f"flashinfer not installed) or the platform doesn't " + f"support it." 
+ ) + # Run each benchmark with the shared impl for config, threshold, num_splits in configs_with_params: # Set threshold for this benchmark (FlashAttn/FlashMLA only) @@ -768,6 +941,7 @@ def _run_mla_benchmark_batched( backend_cfg, mla_dims, device, + indexer=indexer, ) results.append(result) @@ -793,20 +967,27 @@ def run_mla_benchmark( config, reorder_batch_threshold: int | None = None, num_kv_splits: int | None = None, + index_topk: int = 2048, + prefill_backend: str | None = None, ) -> BenchmarkResult | list[BenchmarkResult]: """ Unified MLA benchmark runner for all backends. - Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla + Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, + flashinfer_mla_sparse, flashmla_sparse Always uses batched execution internally for optimal performance. Args: - backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla) + backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, + flashinfer_mla_sparse, flashmla_sparse) config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA (single config mode only) num_kv_splits: Number of KV splits for CUTLASS (single config mode only) + index_topk: Topk value for sparse MLA backends (default 2048) + prefill_backend: Prefill backend name (e.g., "fa3", "fa4"). + When set, forces the specified FlashAttention version for prefill. Returns: BenchmarkResult (single mode) or list of BenchmarkResult (batched mode) @@ -816,9 +997,9 @@ def run_mla_benchmark( # Already in batched format if len(config) > 0 and isinstance(config[0], tuple): # Format: [(cfg, param), ...] 
where param is threshold or num_splits - if backend in ("flashattn_mla", "flashmla"): + if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"): configs_with_params = [(cfg, param, None) for cfg, param in config] - else: # cutlass_mla or flashinfer_mla + else: # cutlass_mla, flashinfer_mla, or sparse backends configs_with_params = [(cfg, None, param) for cfg, param in config] else: # Format: [cfg, ...] - just configs @@ -830,7 +1011,9 @@ def run_mla_benchmark( return_single = True # Use unified batched execution - results = _run_mla_benchmark_batched(backend, configs_with_params) + results = _run_mla_benchmark_batched( + backend, configs_with_params, index_topk, prefill_backend=prefill_backend + ) # Return single result or list based on input return results[0] if return_single else results diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index bf08a1550c0cea6bad6d70b7a3ea157a717ec75c..6af56e0e94f57276323773a375b8a9ef39cc9bcb 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -8,7 +8,9 @@ This module provides helpers for running standard attention backends (FlashAttention, Triton, FlashInfer) with real vLLM integration. 
""" +import logging import types +from contextlib import contextmanager import numpy as np import torch @@ -24,8 +26,13 @@ from vllm.config import ( ParallelConfig, SchedulerConfig, VllmConfig, + set_current_vllm_config, +) +from vllm.v1.attention.backends.utils import ( + CommonAttentionMetadata, + get_kv_cache_layout, + set_kv_cache_layout, ) -from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec # ============================================================================ @@ -33,37 +40,41 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec # ============================================================================ -_BACKEND_CONFIG = { - "flash": { - "module": "vllm.v1.attention.backends.flash_attn", - "backend_class": "FlashAttentionBackend", - "dtype": torch.float16, - "cache_layout": "standard", - # ^ [2, num_blocks, block_size, num_kv_heads, head_dim] - }, - "triton": { - "module": "vllm.v1.attention.backends.triton_attn", - "backend_class": "TritonAttentionBackend", - "dtype": torch.float32, - "cache_layout": "standard", - }, - "flashinfer": { - "module": "vllm.v1.attention.backends.flashinfer", - "backend_class": "FlashInferBackend", - "dtype": torch.float16, - "cache_layout": "flashinfer", - # ^ [num_blocks, 2, block_size, num_kv_heads, head_dim] - }, -} +def _get_backend_config(backend: str) -> dict: + """ + Get backend configuration from AttentionBackendEnum. 
+ + Args: + backend: Backend name matching AttentionBackendEnum exactly + (e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER") + Returns: + Dict with backend_class + """ + from vllm.v1.attention.backends.registry import AttentionBackendEnum -def _get_backend_config(backend: str) -> dict: - if backend not in _BACKEND_CONFIG: + try: + backend_enum = AttentionBackendEnum[backend] + backend_class = backend_enum.get_class() + except (KeyError, ValueError) as e: + valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"] raise ValueError( - f"Unknown backend: {backend}. " - f"Available: {', '.join(_BACKEND_CONFIG.keys())}" - ) - return _BACKEND_CONFIG[backend] + f"Unknown backend: {backend}. Valid backends: {valid_backends}" + ) from e + + return {"backend_class": backend_class} + + +@contextmanager +def log_warnings_and_errors_only(): + """Temporarily set vLLM logger to WARNING level.""" + logger = logging.getLogger("vllm") + old_level = logger.level + logger.setLevel(logging.WARNING) + try: + yield + finally: + logger.setLevel(old_level) # ============================================================================ @@ -88,11 +99,7 @@ def _build_common_attn_metadata( query_start_loc_cpu = query_start_loc.cpu() seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device) - seq_lens_cpu = seq_lens.cpu() - max_seq_len = int(seq_lens_cpu.max()) - - context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)] - num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32) + max_seq_len = int(seq_lens.max().item()) max_blocks = (max(kv_lens) + block_size - 1) // block_size num_blocks = batch_size * max_blocks @@ -107,8 +114,6 @@ def _build_common_attn_metadata( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=batch_size, num_actual_tokens=total_tokens, max_query_len=max_query_len, @@ -121,7 +126,6 @@ def 
_build_common_attn_metadata( def _create_vllm_config( config: BenchmarkConfig, - dtype: torch.dtype, max_num_blocks: int, ) -> VllmConfig: """Create a VllmConfig for benchmarking with mock model methods.""" @@ -129,7 +133,7 @@ def _create_vllm_config( model="meta-llama/Meta-Llama-3-8B", tokenizer="meta-llama/Meta-Llama-3-8B", trust_remote_code=False, - dtype=dtype, + dtype="auto", # Use model's native dtype seed=0, max_model_len=1024, ) @@ -137,7 +141,6 @@ def _create_vllm_config( cache_config = CacheConfig( block_size=config.block_size, cache_dtype="auto", - swap_space=0, ) cache_config.num_gpu_blocks = max_num_blocks cache_config.num_cpu_blocks = 0 @@ -198,15 +201,12 @@ def _create_backend_impl( backend_cfg: dict, config: BenchmarkConfig, device: torch.device, + dtype: torch.dtype, ): """Create backend implementation instance.""" - import importlib - - backend_module = importlib.import_module(backend_cfg["module"]) - backend_class = getattr(backend_module, backend_cfg["backend_class"]) + backend_class = backend_cfg["backend_class"] scale = get_attention_scale(config.head_dim) - dtype = backend_cfg["dtype"] impl = backend_class.get_impl_cls()( num_heads=config.num_q_heads, @@ -227,7 +227,7 @@ def _create_backend_impl( layer = MockLayer(device, kv_cache_spec=kv_cache_spec) - return backend_class, impl, layer, dtype + return backend_class, impl, layer def _create_metadata_builder( @@ -235,11 +235,44 @@ def _create_metadata_builder( kv_cache_spec: FullAttentionSpec, vllm_config: VllmConfig, device: torch.device, + backend_name: str = "", ): """Create metadata builder instance.""" - return backend_class.get_builder_cls()( + layer_names = ["layer_0"] + builder_cls = backend_class.get_builder_cls() + + # Flashinfer needs get_per_layer_parameters mocked since we don't have + # real model layers registered + if backend_name == "FLASHINFER": + import unittest.mock + + from vllm.v1.attention.backends.utils import PerLayerParameters + + def 
mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls): + head_size = vllm_config.model_config.get_head_size() + return { + layer_name: PerLayerParameters( + window_left=-1, # No sliding window + logits_soft_cap=0.0, # No soft cap + sm_scale=1.0 / (head_size**0.5), # Standard scale + ) + for layer_name in layer_names + } + + with unittest.mock.patch( + "vllm.v1.attention.backends.flashinfer.get_per_layer_parameters", + mock_get_per_layer_parameters, + ): + return builder_cls( + kv_cache_spec=kv_cache_spec, + layer_names=layer_names, + vllm_config=vllm_config, + device=device, + ) + + return builder_cls( kv_cache_spec=kv_cache_spec, - layer_names=["layer_0"], + layer_names=layer_names, vllm_config=vllm_config, device=device, ) @@ -281,39 +314,44 @@ def _create_input_tensors( def _create_kv_cache( config: BenchmarkConfig, max_num_blocks: int, - cache_layout: str, + backend_class, device: torch.device, dtype: torch.dtype, ) -> list: - """Create KV cache tensors for all layers.""" - if cache_layout == "flashinfer": - # FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim] - cache_list = [ - torch.zeros( - max_num_blocks, - 2, - config.block_size, - config.num_kv_heads, - config.head_dim, - device=device, - dtype=dtype, - ) - for _ in range(config.num_layers) - ] - else: - # Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim] - cache_list = [ - torch.zeros( - 2, - max_num_blocks, - config.block_size, - config.num_kv_heads, - config.head_dim, - device=device, - dtype=dtype, - ) - for _ in range(config.num_layers) - ] + """Create KV cache tensors for all layers using the backend's methods. + + Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order() + to create the cache with the correct shape and memory layout. 
+ """ + # Get the logical shape from the backend + cache_shape = backend_class.get_kv_cache_shape( + num_blocks=max_num_blocks, + block_size=config.block_size, + num_kv_heads=config.num_kv_heads, + head_size=config.head_dim, + ) + + # Get the stride order for custom memory layout + try: + stride_order = backend_class.get_kv_cache_stride_order() + assert len(stride_order) == len(cache_shape) + except (AttributeError, NotImplementedError): + stride_order = tuple(range(len(cache_shape))) + + # Permute shape to physical layout order + physical_shape = tuple(cache_shape[i] for i in stride_order) + + # Compute inverse permutation to get back to logical view + inv_order = [stride_order.index(i) for i in range(len(stride_order))] + + cache_list = [] + for _ in range(config.num_layers): + # Allocate in physical layout order (contiguous in memory) + cache = torch.zeros(*physical_shape, device=device, dtype=dtype) + # Permute to logical view + cache = cache.permute(*inv_order) + cache_list.append(cache) + return cache_list @@ -352,7 +390,7 @@ def _run_single_benchmark( attn_metadata, output=out, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Benchmark times = [] @@ -373,15 +411,15 @@ def _run_single_benchmark( ) end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() elapsed_ms = start.elapsed_time(end) times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer mem_stats = {} if config.profile_memory: mem_stats = { - "allocated_mb": torch.cuda.memory_allocated(device) / 1024**2, - "reserved_mb": torch.cuda.memory_reserved(device) / 1024**2, + "allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2, + "reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2, } return times, mem_stats @@ -396,7 +434,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: """ Run standard attention benchmark with real kernels. 
- Supports: flash, triton, flashinfer + Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER Args: config: Benchmark configuration @@ -405,66 +443,85 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: BenchmarkResult with timing and memory statistics """ device = torch.device(config.device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) backend_cfg = _get_backend_config(config.backend) requests = parse_batch_spec(config.batch_spec) - if config.backend == "flashinfer": + if config.backend == "FLASHINFER": requests = reorder_for_flashinfer(requests) q_lens = [r.q_len for r in requests] kv_lens = [r.kv_len for r in requests] total_q = sum(q_lens) max_kv = max(kv_lens) + batch_size = len(q_lens) - max_num_blocks = (max_kv + config.block_size - 1) // config.block_size - - backend_class, impl, layer, dtype = _create_backend_impl( - backend_cfg, config, device - ) + # Calculate total blocks needed: batch_size * max_blocks_per_request + max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size + max_num_blocks = batch_size * max_blocks_per_request + + # Suppress vLLM logs during setup to reduce spam + with log_warnings_and_errors_only(): + # Create vllm_config first - uses model's native dtype via "auto" + vllm_config = _create_vllm_config(config, max_num_blocks) + dtype = vllm_config.model_config.dtype + + # Wrap everything in set_current_vllm_config context + # This is required for backends like flashinfer that need global config + with set_current_vllm_config(vllm_config): + backend_class, impl, layer = _create_backend_impl( + backend_cfg, config, device, dtype + ) - common_metadata = _build_common_attn_metadata( - q_lens, kv_lens, config.block_size, device - ) + # Set KV cache layout if the backend requires a specific one + # (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention) + required_layout = backend_class.get_required_kv_cache_layout() + if required_layout is not None: + 
set_kv_cache_layout(required_layout) + get_kv_cache_layout.cache_clear() - kv_cache_spec = FullAttentionSpec( - block_size=config.block_size, - num_kv_heads=config.num_kv_heads, - head_size=config.head_dim, - dtype=dtype, - ) + common_metadata = _build_common_attn_metadata( + q_lens, kv_lens, config.block_size, device + ) - vllm_config = _create_vllm_config(config, dtype, max_num_blocks) + kv_cache_spec = FullAttentionSpec( + block_size=config.block_size, + num_kv_heads=config.num_kv_heads, + head_size=config.head_dim, + dtype=dtype, + ) - builder = _create_metadata_builder( - backend_class, kv_cache_spec, vllm_config, device - ) + builder = _create_metadata_builder( + backend_class, kv_cache_spec, vllm_config, device, config.backend + ) - attn_metadata = builder.build( - common_prefix_len=0, - common_attn_metadata=common_metadata, - ) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_metadata, + ) - q_list, k_list, v_list = _create_input_tensors(config, total_q, device, dtype) + q_list, k_list, v_list = _create_input_tensors( + config, total_q, device, dtype + ) - cache_list = _create_kv_cache( - config, max_num_blocks, backend_cfg["cache_layout"], device, dtype - ) + cache_list = _create_kv_cache( + config, max_num_blocks, backend_class, device, dtype + ) - times, mem_stats = _run_single_benchmark( - config, - impl, - layer, - q_list, - k_list, - v_list, - cache_list, - attn_metadata, - device, - dtype, - ) + times, mem_stats = _run_single_benchmark( + config, + impl, + layer, + q_list, + k_list, + v_list, + cache_list, + attn_metadata, + device, + dtype, + ) mean_time = np.mean(times) throughput = total_q / mean_time if mean_time > 0 else 0 diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index 9a9600e08dafeccbfeff11ae3450c83b22c9f999..9b2a1ed45b1fbcae69358e207d79dcdd2464d170 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -41,7 +41,7 @@ 
MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LE | --- | --- | --- | | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` | | `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` | -| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` | +| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` | | `TP` | **Required.** The tensor-parallelism size. | `1` | | `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) | | `INPUT_LEN` | **Required.** Request input length. | `4000` | diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index a245e2022e605f8478279f1367cd4aa79fd6a200..c06b76be5ee68166939c560de7453ec4cfe0506f 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -46,10 +46,10 @@ echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL" echo "RESULT_FILE=$RESULT" echo "====================== AUTO TUNEPARAMETERS ====================" -rm -rf $LOG_FOLDER -rm -rf $PROFILE_PATH -mkdir -p $LOG_FOLDER -mkdir -p $PROFILE_PATH +rm -rf "$LOG_FOLDER" +rm -rf "$PROFILE_PATH" +mkdir -p "$LOG_FOLDER" +mkdir -p "$PROFILE_PATH" cd "$BASE/vllm" @@ -85,7 +85,6 @@ start_server() { # Each argument and its value are separate elements. local common_args_array=( "$MODEL" - "--disable-log-requests" "--port" "8004" "--host" "$HOSTNAME" "--gpu-memory-utilization" "$gpu_memory_utilization" @@ -114,7 +113,7 @@ start_server() { # wait for 10 minutes... 
server_started=0 - for i in {1..60}; do + for _ in {1..60}; do # This line checks whether the server is still alive or not, # since that we should always have permission to send signal to the server process. kill -0 $server_pid 2> /dev/null || break @@ -145,12 +144,12 @@ run_benchmark() { local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" echo "vllm_log: $vllm_log" echo - rm -f $vllm_log + rm -f "$vllm_log" pkill -if "vllm serve" || true echo "starting server..." # Call start_server without a profile_dir to avoid profiling overhead - start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log "" + start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" "" result=$? if [[ "$result" -eq 1 ]]; then echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" @@ -168,15 +167,15 @@ run_benchmark() { # --profile flag is removed from this call vllm bench serve \ --backend vllm \ - --model $MODEL \ + --model "$MODEL" \ --dataset-name random \ --random-input-len $adjusted_input_len \ - --random-output-len $OUTPUT_LEN \ + --random-output-len "$OUTPUT_LEN" \ --ignore-eos \ --disable-tqdm \ --request-rate inf \ --percentile-metrics ttft,tpot,itl,e2el \ - --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \ --num-prompts 1000 \ --random-prefix-len $prefix_len \ --host "$HOSTNAME" \ @@ -195,20 +194,20 @@ run_benchmark() { request_rate=$((${throughput%.*} + 1)) while ((request_rate > 0)); do # clear prefix cache - curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache + curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" vllm bench serve \ --backend vllm \ - --model $MODEL \ + --model "$MODEL" \ --dataset-name random \ --random-input-len 
$adjusted_input_len \ - --random-output-len $OUTPUT_LEN \ + --random-output-len "$OUTPUT_LEN" \ --ignore-eos \ --disable-tqdm \ --request-rate $request_rate \ --percentile-metrics ttft,tpot,itl,e2el \ - --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \ --num-prompts 100 \ --random-prefix-len $prefix_len \ --host "$HOSTNAME" \ @@ -255,7 +254,7 @@ gpu_memory_utilization=0.98 find_gpu_memory_utilization=0 while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do # Pass empty string for profile_dir argument - start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" + start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" "" result=$? if [[ "$result" -eq 0 ]]; then find_gpu_memory_utilization=1 @@ -274,7 +273,7 @@ fi for num_seqs in "${num_seqs_list[@]}"; do for num_batched_tokens in "${num_batched_tokens_list[@]}"; do - run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization + run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization" done done echo "finish permutations" @@ -285,7 +284,7 @@ echo "finish permutations" if (( $(echo "$best_throughput > 0" | bc -l) )); then echo echo "Benchmark tuning finished. Now running profiling on the best configuration found..." - echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput" + echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput" echo vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt" @@ -293,7 +292,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then # Start server with the best params and profiling ENABLED echo "Starting server for profiling..." 
- start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH" + start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH" # Run benchmark with the best params and the --profile flag echo "Running benchmark with profiling..." @@ -301,15 +300,15 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then adjusted_input_len=$(( INPUT_LEN - prefix_len )) vllm bench serve \ --backend vllm \ - --model $MODEL \ + --model "$MODEL" \ --dataset-name random \ --random-input-len $adjusted_input_len \ - --random-output-len $OUTPUT_LEN \ + --random-output-len "$OUTPUT_LEN" \ --ignore-eos \ --disable-tqdm \ - --request-rate $best_request_rate \ + --request-rate "$best_request_rate" \ --percentile-metrics ttft,tpot,itl,e2el \ - --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ + --goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \ --num-prompts 100 \ --random-prefix-len $prefix_len \ --host "$HOSTNAME" \ diff --git a/benchmarks/auto_tune/batch_auto_tune.sh b/benchmarks/auto_tune/batch_auto_tune.sh index 57ef20daf6b7144a83e0eb7f0685a9ab378b9f8c..0f3ef0f0385d2e221b8720f3cfd5829c3154999f 100755 --- a/benchmarks/auto_tune/batch_auto_tune.sh +++ b/benchmarks/auto_tune/batch_auto_tune.sh @@ -64,7 +64,7 @@ for i in $(seq 0 $(($num_runs - 1))); do else STATUS="FAILURE" ((FAILURE_COUNT++)) - FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)") + FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)") fi RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE") diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 831b76b66e096be235badb0c0cc7f5a428dbb738..a69637bfc437dd10079774a4943ca603dc9a2e20 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -649,9 +649,3 @@ ASYNC_REQUEST_FUNCS = { "sglang": async_request_openai_completions, "llama.cpp": async_request_openai_completions, } - -OPENAI_COMPATIBLE_BACKENDS = [ - k - for k, v in 
ASYNC_REQUEST_FUNCS.items() - if v in (async_request_openai_completions, async_request_openai_chat_completions) -] diff --git a/benchmarks/benchmark_topk_topp.py b/benchmarks/benchmark_topk_topp.py new file mode 100644 index 0000000000000000000000000000000000000000..f727f16ea29c0a9120e7b21092bd6740b60780c9 --- /dev/null +++ b/benchmarks/benchmark_topk_topp.py @@ -0,0 +1,474 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark comparing Triton vs PyTorch sort-based top-k/top-p implementations. + +Compares: +- apply_top_k_top_p_triton (Triton binary search) +- apply_top_k_top_p (PyTorch sort-based) + +Scenarios: +- top_k only (whole batch, partial batch) +- top_p only (whole batch, partial batch) +- mix of top_k and top_p +""" + +import argparse +import gc +from dataclasses import dataclass + +import torch + +from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch +from vllm.v1.sample.ops.topk_topp_triton import ( + apply_top_k_top_p_triton, + reset_buffer_cache, +) + + +@dataclass +class BenchmarkConfig: + """Configuration for a benchmark run.""" + + name: str + batch_size: int + vocab_size: int + # k and p can be tensors or None + k_values: torch.Tensor | None # [batch_size] or None + p_values: torch.Tensor | None # [batch_size] or None + description: str + ops_pct: float = 0.0 # Percentage of ops relative to batch size + + +def calculate_ops_pct( + k_values: torch.Tensor | None, + p_values: torch.Tensor | None, + vocab_size: int, + batch_size: int, +) -> float: + """ + Calculate the percentage of active top-k and top-p operations. + + Returns percentage where 100% = batch_size ops. + E.g., if all rows have both top-k and top-p active, returns 200%. 
+ """ + active_ops = 0 + + if k_values is not None: + # Count rows where k < vocab_size (active top-k filtering) + active_ops += (k_values < vocab_size).sum().item() + + if p_values is not None: + # Count rows where p < 1.0 (active top-p filtering) + active_ops += (p_values < 1.0).sum().item() + + return (active_ops / batch_size) * 100 if batch_size > 0 else 0.0 + + +def create_logits( + batch_size: int, vocab_size: int, device: str = "cuda" +) -> torch.Tensor: + """Create random logits mimicking a realistic LLM distribution. + + Uses a Zipf-like probability distribution (rank^-1.1) converted to logits + via log, then randomly permuted per row. This produces a peaked distribution + where a small number of tokens capture most probability mass, similar to + real model outputs. + """ + # Create Zipf-like probabilities: p(rank) ~ rank^(-alpha) + ranks = torch.arange(1, vocab_size + 1, dtype=torch.float32, device=device) + probs = ranks.pow(-1.1) + probs = probs / probs.sum() + + # Convert to logits (log-probabilities, unnormalized is fine) + base_logits = probs.log() + + # Broadcast to batch and randomly permute each row + logits = base_logits.unsqueeze(0).expand(batch_size, -1).clone() + for i in range(batch_size): + logits[i] = logits[i, torch.randperm(vocab_size, device=device)] + + return logits + + +def measure_memory() -> tuple[int, int]: + """Return (allocated, reserved) memory in bytes.""" + torch.accelerator.synchronize() + return ( + torch.accelerator.memory_allocated(), + torch.accelerator.max_memory_allocated(), + ) + + +def reset_memory_stats(): + """Reset peak memory statistics.""" + reset_buffer_cache() + torch.accelerator.reset_peak_memory_stats() + torch.accelerator.empty_cache() + gc.collect() + + +def benchmark_function( + func, + logits: torch.Tensor, + k: torch.Tensor | None, + p: torch.Tensor | None, + warmup_iters: int = 5, + benchmark_iters: int = 20, +) -> tuple[float, int]: + """ + Benchmark a function and return (avg_time_ms, 
peak_memory_bytes). + + Returns average time in milliseconds and peak memory usage. + """ + # Warmup + for _ in range(warmup_iters): + logits_copy = logits.clone() + func(logits_copy, k, p) + torch.accelerator.synchronize() + + # Reset memory stats before benchmark + reset_memory_stats() + + # Benchmark + start_events = [ + torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters) + ] + end_events = [torch.cuda.Event(enable_timing=True) for _ in range(benchmark_iters)] + + for i in range(benchmark_iters): + logits_copy = logits.clone() + start_events[i].record() + func(logits_copy, k, p) + end_events[i].record() + + torch.accelerator.synchronize() + + # Calculate timing + times = [ + start_events[i].elapsed_time(end_events[i]) for i in range(benchmark_iters) + ] + avg_time = sum(times) / len(times) + + # Get peak memory + _, peak_memory = measure_memory() + + return avg_time, peak_memory + + +def create_benchmark_configs( + batch_sizes: list[int], + vocab_sizes: list[int], + device: str = "cuda", +) -> list[BenchmarkConfig]: + """Create all benchmark configurations.""" + configs = [] + + for vocab_size in vocab_sizes: + for batch_size in batch_sizes: + # 1. Top-k only - whole batch (all rows have k < vocab_size) + k_all = torch.full((batch_size,), 50, dtype=torch.int32, device=device) + configs.append( + BenchmarkConfig( + name=f"topk_whole_b{batch_size}_v{vocab_size // 1000}k", + batch_size=batch_size, + vocab_size=vocab_size, + k_values=k_all, + p_values=None, + description=f"Top-k only (whole batch, k=50), " + f"batch={batch_size}, vocab={vocab_size}", + ops_pct=calculate_ops_pct(k_all, None, vocab_size, batch_size), + ) + ) + + # 2. 
Top-k only - partial batch (half have k=50, half have k=vocab_size) + k_partial = torch.full((batch_size,), 50, dtype=torch.int32, device=device) + k_partial[batch_size // 2 :] = vocab_size # No filtering for second half + configs.append( + BenchmarkConfig( + name=f"topk_partial_b{batch_size}_v{vocab_size // 1000}k", + batch_size=batch_size, + vocab_size=vocab_size, + k_values=k_partial, + p_values=None, + description=f"Top-k only (partial batch, 50% k=50, 50% k=vocab), " + f"batch={batch_size}, vocab={vocab_size}", + ops_pct=calculate_ops_pct(k_partial, None, vocab_size, batch_size), + ) + ) + + # 3. Top-p only - whole batch (all rows have p < 1.0) + p_all = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device) + configs.append( + BenchmarkConfig( + name=f"topp_whole_b{batch_size}_v{vocab_size // 1000}k", + batch_size=batch_size, + vocab_size=vocab_size, + k_values=None, + p_values=p_all, + description=f"Top-p only (whole batch, p=0.9), " + f"batch={batch_size}, vocab={vocab_size}", + ops_pct=calculate_ops_pct(None, p_all, vocab_size, batch_size), + ) + ) + + # 4. Top-p only - partial batch (half have p=0.9, half have p=1.0) + p_partial = torch.full( + (batch_size,), 0.9, dtype=torch.float32, device=device + ) + p_partial[batch_size // 2 :] = 1.0 # No filtering for second half + configs.append( + BenchmarkConfig( + name=f"topp_partial_b{batch_size}_v{vocab_size // 1000}k", + batch_size=batch_size, + vocab_size=vocab_size, + k_values=None, + p_values=p_partial, + description=f"Top-p only (partial batch, 50% p=0.9, 50% p=1.0), " + f"batch={batch_size}, vocab={vocab_size}", + ops_pct=calculate_ops_pct(None, p_partial, vocab_size, batch_size), + ) + ) + + # 5. 
Mix of top-k and top-p (both applied to whole batch) + k_mix = torch.full((batch_size,), 100, dtype=torch.int32, device=device) + p_mix = torch.full((batch_size,), 0.9, dtype=torch.float32, device=device) + configs.append( + BenchmarkConfig( + name=f"topk_topp_whole_b{batch_size}_v{vocab_size // 1000}k", + batch_size=batch_size, + vocab_size=vocab_size, + k_values=k_mix, + p_values=p_mix, + description=f"Top-k + Top-p (whole batch, k=100, p=0.9), " + f"batch={batch_size}, vocab={vocab_size}", + ops_pct=calculate_ops_pct(k_mix, p_mix, vocab_size, batch_size), + ) + ) + + # 6. Mix with partial application (some rows k only, some p only, some both) + k_mixed = torch.full( + (batch_size,), vocab_size, dtype=torch.int32, device=device + ) + p_mixed = torch.full((batch_size,), 1.0, dtype=torch.float32, device=device) + # First third: k only + third = batch_size // 3 + k_mixed[:third] = 50 + # Second third: p only + p_mixed[third : 2 * third] = 0.5 + # Last third: both k and p + k_mixed[2 * third :] = 100 + p_mixed[2 * third :] = 0.9 + configs.append( + BenchmarkConfig( + name=f"mixed_partial_b{batch_size}_v{vocab_size // 1000}k", + batch_size=batch_size, + vocab_size=vocab_size, + k_values=k_mixed, + p_values=p_mixed, + description=f"Mixed partial (1/3 k=50, 1/3 p=0.9, 1/3 both), " + f"batch={batch_size}, vocab={vocab_size}", + ops_pct=calculate_ops_pct(k_mixed, p_mixed, vocab_size, batch_size), + ) + ) + + return configs + + +def format_memory(bytes_val: int) -> str: + """Format memory in human-readable form.""" + if bytes_val >= 1024**3: + return f"{bytes_val / (1024**3):.2f} GB" + elif bytes_val >= 1024**2: + return f"{bytes_val / (1024**2):.2f} MB" + elif bytes_val >= 1024: + return f"{bytes_val / 1024:.2f} KB" + return f"{bytes_val} B" + + +def run_benchmark( + configs: list[BenchmarkConfig], + warmup_iters: int = 5, + benchmark_iters: int = 20, + verbose: bool = True, +): + """Run all benchmarks and print results.""" + results = [] + + print("=" * 100) + 
print("Top-k/Top-p Benchmark: Triton vs PyTorch Sort-based") + print("=" * 100) + print() + + for config in configs: + if verbose: + print(f"Running: {config.description}") + + # Create fresh logits for this config + logits = create_logits(config.batch_size, config.vocab_size) + + # Benchmark Triton + reset_memory_stats() + triton_time, triton_mem = benchmark_function( + apply_top_k_top_p_triton, + logits, + config.k_values, + config.p_values, + warmup_iters, + benchmark_iters, + ) + + # Benchmark PyTorch + reset_memory_stats() + pytorch_time, pytorch_mem = benchmark_function( + apply_top_k_top_p_pytorch, + logits, + config.k_values, + config.p_values, + warmup_iters, + benchmark_iters, + ) + + speedup = pytorch_time / triton_time if triton_time > 0 else float("inf") + mem_ratio = pytorch_mem / triton_mem if triton_mem > 0 else float("inf") + + result = { + "config": config, + "triton_time_ms": triton_time, + "pytorch_time_ms": pytorch_time, + "triton_mem": triton_mem, + "pytorch_mem": pytorch_mem, + "speedup": speedup, + "mem_ratio": mem_ratio, + } + results.append(result) + + if verbose: + print(f" Triton: {triton_time:.3f} ms, {format_memory(triton_mem)}") + print(f" PyTorch: {pytorch_time:.3f} ms, {format_memory(pytorch_mem)}") + print(f" Speedup: {speedup:.2f}x, Memory ratio: {mem_ratio:.2f}x") + print() + + # Clean up + del logits + reset_memory_stats() + + return results + + +def print_summary_table(results: list[dict]): + """Print a summary table of results.""" + print() + print("=" * 130) + print("SUMMARY TABLE") + print("=" * 130) + print() + + # Header + header = ( + f"{'Scenario':<40} {'Batch':>6} {'Vocab':>7} {'Ops%':>6} " + f"{'Triton (ms)':>12} {'PyTorch (ms)':>13} {'Speedup':>8} " + f"{'Tri Mem':>10} {'Pyt Mem':>10}" + ) + print(header) + print("-" * 130) + + # Group by scenario type + current_vocab = None + for result in results: + config = result["config"] + + # Add separator between vocab sizes + if current_vocab != config.vocab_size: + if 
current_vocab is not None: + print("-" * 130) + current_vocab = config.vocab_size + + scenario = config.name.split("_b")[0] # Extract scenario name + print( + f"{scenario:<40} {config.batch_size:>6} {config.vocab_size:>7} " + f"{config.ops_pct:>5.0f}% " + f"{result['triton_time_ms']:>12.3f} {result['pytorch_time_ms']:>13.3f} " + f"{result['speedup']:>7.2f}x " + f"{format_memory(result['triton_mem']):>10} " + f"{format_memory(result['pytorch_mem']):>10}" + ) + + print("=" * 130) + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark Triton vs PyTorch sort-based top-k/top-p implementations" + ) + parser.add_argument( + "--batch-sizes", + type=int, + nargs="+", + default=[1, 4, 16, 64, 128, 512, 1024, 2048], + help="Batch sizes to test (default: 1 4 16 64)", + ) + parser.add_argument( + "--vocab-sizes", + type=int, + nargs="+", + default=[32768, 131072], # 32k, 128k + help="Vocabulary sizes to test (default: 32768 131072)", + ) + parser.add_argument( + "--warmup-iters", + type=int, + default=5, + help="Number of warmup iterations (default: 5)", + ) + parser.add_argument( + "--benchmark-iters", + type=int, + default=20, + help="Number of benchmark iterations (default: 20)", + ) + parser.add_argument( + "--quiet", + action="store_true", + help="Only print summary table", + ) + + args = parser.parse_args() + + # Print configuration + print(f"Batch sizes: {args.batch_sizes}") + print(f"Vocab sizes: {args.vocab_sizes}") + print(f"Warmup iterations: {args.warmup_iters}") + print(f"Benchmark iterations: {args.benchmark_iters}") + print() + + # Check CUDA + if not torch.cuda.is_available(): + print("ERROR: CUDA is not available. 
This benchmark requires a GPU.") + return + + device_name = torch.cuda.get_device_name(0) + print(f"GPU: {device_name}") + print() + + # Create configs + configs = create_benchmark_configs( + args.batch_sizes, + args.vocab_sizes, + ) + + # Run benchmarks + results = run_benchmark( + configs, + warmup_iters=args.warmup_iters, + benchmark_iters=args.benchmark_iters, + verbose=not args.quiet, + ) + + # Print summary + print_summary_table(results) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index f0d661f9d53498f5067bfb0ef7d19d52af84699d..5865473e95426bcc89ab4c4130de76ca81e34d49 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,78 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import argparse -import json -import math -import os import time from types import TracebackType -from typing import Any - - -def convert_to_pytorch_benchmark_format( - args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any] -) -> list: - """ - Save the benchmark results in the format used by PyTorch OSS benchmark with - on metric per record - https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database - """ - records = [] - if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): - return records - - for name, benchmark_values in metrics.items(): - record = { - "benchmark": { - "name": "vLLM benchmark", - "extra_info": { - "args": vars(args), - }, - }, - "model": { - "name": args.model, - }, - "metric": { - "name": name, - "benchmark_values": benchmark_values, - "extra_info": extra_info, - }, - } - - tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") - # Save tensor_parallel_size parameter if it's part of the metadata - if not tp and "tensor_parallel_size" in extra_info: - 
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = ( - extra_info["tensor_parallel_size"] - ) - - records.append(record) - - return records - - -class InfEncoder(json.JSONEncoder): - def clear_inf(self, o: Any): - if isinstance(o, dict): - return {k: self.clear_inf(v) for k, v in o.items()} - elif isinstance(o, list): - return [self.clear_inf(v) for v in o] - elif isinstance(o, float) and math.isinf(o): - return "inf" - return o - - def iterencode(self, o: Any, *args, **kwargs) -> Any: - return super().iterencode(self.clear_inf(o), *args, **kwargs) - - -def write_to_json(filename: str, records: list) -> None: - with open(filename, "w") as f: - json.dump( - records, - f, - cls=InfEncoder, - default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", - ) # Collect time and generate time metrics diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index b4f3c6bf94eda0e1bf1d253def3b17d273415dcc..6cbcf6b68c89fc9e2719ccce8ab948276558fa2f 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Cutlass bench utils -from collections.abc import Iterable import torch @@ -86,15 +85,3 @@ def make_rand_sparse_tensors( # Compressed B, Metadata, Original A, B return b_compressed, e, a, b - - -def make_n_rand_sparse_tensors( - num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int -) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]: - ABs = [] - for _ in range(num_tensors): - b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) - if b_comp is not None: - ABs.append(make_rand_sparse_tensors(dtype, m, n, k)) - BComps, Es, As, Bs = zip(*ABs) - return list(BComps), list(Es), list(As), list(Bs) diff --git a/benchmarks/disagg_benchmarks/rate_limiter.py b/benchmarks/disagg_benchmarks/rate_limiter.py deleted file mode 100644 index 
87ac8cb6ab1a91d59bc4ea89b362521ce051b841..0000000000000000000000000000000000000000 --- a/benchmarks/disagg_benchmarks/rate_limiter.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import time - - -class RateLimiter: - """Token bucket rate limiter implementation""" - - def __init__(self, rate_limit): - self.rate_limit = rate_limit # Requests per second - self.num_available_tokens = rate_limit # Available tokens - self.last_refill = time.monotonic() # Last token refill time - self.lock = asyncio.Lock() # Synchronization lock - - async def acquire(self): - """Acquire a token from the rate limiter""" - while True: - async with self.lock: - current_time = time.monotonic() - elapsed = current_time - self.last_refill - - # Refill num_available_tokens if more than 1 second has passed - if elapsed > 1.0: - self.num_available_tokens = self.rate_limit - self.last_refill = current_time - - # Check if num_available_tokens are available - if self.num_available_tokens > 0: - self.num_available_tokens -= 1 - return True - - # Calculate wait time if no num_available_tokens available - wait_time = 1.0 - elapsed - await asyncio.sleep(wait_time) - - async def __aenter__(self): - """Enter async context manager - acquire token""" - await self.acquire() - return self - - async def __aexit__(self, exc_type, exc_value, traceback): - """Exit async context manager - no cleanup needed""" - pass diff --git a/benchmarks/disagg_benchmarks/request_queue.py b/benchmarks/disagg_benchmarks/request_queue.py deleted file mode 100644 index 410bcb956050e8dad1ccc1e70d10cbdf38fa67da..0000000000000000000000000000000000000000 --- a/benchmarks/disagg_benchmarks/request_queue.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -from collections import deque - - -class RequestQueue: - 
"""Request queue manager with concurrency control""" - - def __init__(self, max_concurrent, max_queue_size): - # Maximum concurrent requests - self.max_concurrent = max_concurrent - self.max_queue_size = max_queue_size # Maximum queue size - # Concurrency control - self.semaphore = asyncio.Semaphore(max_concurrent) - self.queue = deque() # Request queue - self.queue_size = 0 # Current queue size - self.lock = asyncio.Lock() # Sync queue Lock - - async def enqueue(self, task): - """Add a request task to the queue""" - async with self.lock: - if self.queue_size >= self.max_queue_size: - return False - - self.queue.append(task) - self.queue_size += 1 - return True - - async def process(self): - """Process queued requests using semaphore for concurrency control""" - while True: - if self.queue: - async with self.semaphore, self.lock: - task = self.queue.popleft() - self.queue_size -= 1 - await task - await asyncio.sleep(0.01) # Yield control to event loop diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index fb3329975cee3f3fb5b94665ff854c0fa8a0d3b3..4978a8777ab5c765ca855b06e872a37ca52ba6fb 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -13,6 +13,7 @@ from torch.utils.benchmark import Measurement as TMeasurement from tqdm import tqdm import vllm._custom_ops as ops +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, @@ -291,6 +292,7 @@ def print_timers(timers: Iterable[TMeasurement]): compare.print() +@default_vllm_config() def main(): torch.set_default_device("cuda") bench_params = get_bench_params() diff --git a/benchmarks/kernels/bench_concat_mla_q.py b/benchmarks/kernels/bench_concat_mla_q.py new file mode 100644 index 
0000000000000000000000000000000000000000..8d940484d6b37a70c1eed4b93d8cccdfa24e0349 --- /dev/null +++ b/benchmarks/kernels/bench_concat_mla_q.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse + +import torch + +from vllm import _custom_ops as ops +from vllm.triton_utils import triton + +# DeepSeek V3 dimensions +NOPE_DIM = 512 +ROPE_DIM = 64 +NUM_HEADS = 128 + +NUM_TOKENS = [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] + + +def get_configs(): + return NUM_TOKENS + + +def make_inputs(num_tokens, dtype): + """Create inputs matching the real code path. + + Returns: + (ql_nope, q_pe): ql_nope simulates the transposed BMM output + (non-contiguous nope with stride pattern from + [N,B,L].transpose(0,1)); q_pe is a freshly allocated [B,N,ROPE_DIM] tensor. + """ + # Simulate: bmm output [N, B, L].transpose(0, 1) -> [B, N, L] + raw = torch.randn(NUM_HEADS, num_tokens, NOPE_DIM, dtype=dtype, device="cuda") + ql_nope = raw.transpose(0, 1) + + q_pe = torch.randn(num_tokens, NUM_HEADS, ROPE_DIM, dtype=dtype, device="cuda") + return ql_nope, q_pe + + +# ---- Non-contiguous nope benchmark (real code path) ---- +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["num_tokens"], + x_vals=get_configs(), + line_arg="provider", + line_vals=["torch_cat", "concat_mla_q"], + line_names=["torch.cat", "concat_mla_q (v8)"], + styles=[("blue", "--"), ("green", "-")], + ylabel="Latency (us)", + plot_name="concat_mla_q-transposed", + args={}, + ) +) +def bench_transposed(num_tokens, provider): + dtype = torch.bfloat16 + ql_nope, q_pe = make_inputs(num_tokens, dtype) + + q_out = torch.empty( + num_tokens, NUM_HEADS, NOPE_DIM + ROPE_DIM, dtype=dtype, device="cuda" + ) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch_cat": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.cat((ql_nope, q_pe), dim=-1), quantiles=quantiles, rep=500 + ) + else: + ms, min_ms, max_ms =
triton.testing.do_bench_cudagraph( + lambda: ops.concat_mla_q(ql_nope, q_pe, q_out), quantiles=quantiles, rep=500 + ) + + return ms * 1000, max_ms * 1000, min_ms * 1000 # us + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark concat_mla_q vs torch.cat") + parser.add_argument( + "--save-path", type=str, default=None, help="Path to save benchmark results" + ) + args = parser.parse_args() + + print("\n" + "=" * 70) + print("CONCAT MLA Q KERNEL BENCHMARKS") + print("=" * 70) + print(f"Dimensions: nope={NOPE_DIM}, rope={ROPE_DIM}, heads={NUM_HEADS}") + print( + f"Per-head output: {NOPE_DIM + ROPE_DIM} bf16 = " + f"{(NOPE_DIM + ROPE_DIM) * 2} bytes" + ) + print(f"num_tokens (decode=batch_size, prefill=chunk_size): {NUM_TOKENS}") + print("=" * 70) + + print("\n--- Non-contiguous nope inputs (transposed BMM output) ---") + bench_transposed.run(print_data=True, save_path=args.save_path) + + print("\n" + "=" * 70) + print("Benchmarking complete!") + print("=" * 70) diff --git a/benchmarks/kernels/bench_cp_gather_fp8.py b/benchmarks/kernels/bench_cp_gather_fp8.py new file mode 100644 index 0000000000000000000000000000000000000000..19fc84c4df76761183e6623dae8c214fb5e54d20 --- /dev/null +++ b/benchmarks/kernels/bench_cp_gather_fp8.py @@ -0,0 +1,153 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import math + +import torch + +from vllm import _custom_ops as ops +from vllm.triton_utils import triton + +# DeepSeek V3 MLA dimensions +NOPE_DIM = 512 +ROPE_DIM = 64 +HEAD_DIM = NOPE_DIM + ROPE_DIM # 576 BF16 output elements per token +ENTRY_BYTES = 656 # 512 FP8 + 16 scales + 128 BF16 RoPE +BLOCK_SIZE = 64 # tokens per physical cache block - get_supported_kernel_block_sizes + +# Realistic prefill scenarios: +# - 1 long prefill: single request, 8K-96K tokens +# - 4 medium prefills: 4 requests, 2K-24K tokens each +# - 16 shorter prefills: 16 requests, 0.5K-6K tokens
each +SCENARIOS = [ + # (label, num_reqs, total_tokens_list) + ("1-req", 1, [8192, 16384, 32768, 65536, 98304]), + ("4-reqs", 4, [8192, 16384, 32768, 65536, 98304]), + ("16-reqs", 16, [8192, 16384, 32768, 65536, 98304]), +] + + +def make_inputs(total_tokens, num_reqs, block_size): + """Create synthetic FP8 cache, block table, and output buffer. + + Fills the cache with random bytes (we only measure throughput, + not correctness). Block table maps each request to contiguous + physical blocks. + """ + # Divide tokens evenly across requests + base_len = total_tokens // num_reqs + remainder = total_tokens % num_reqs + seq_lens = [base_len + (1 if r < remainder else 0) for r in range(num_reqs)] + + # workspace_starts: cumulative sum of seq_lens + workspace_starts = [0] * num_reqs + for r in range(1, num_reqs): + workspace_starts[r] = workspace_starts[r - 1] + seq_lens[r - 1] + + # Physical blocks needed per request + blocks_per_req = [math.ceil(s / block_size) for s in seq_lens] + total_blocks = sum(blocks_per_req) + max_blocks = max(blocks_per_req) + + # Allocate cache with random data (content doesn't matter for perf) + cache = torch.randint( + 0, + 256, + (total_blocks, block_size, ENTRY_BYTES), + dtype=torch.uint8, + device="cuda", + ) + + # Block table: contiguous block assignments + block_table = torch.zeros(num_reqs, max_blocks, dtype=torch.int32, device="cuda") + block_idx = 0 + for r in range(num_reqs): + for b in range(blocks_per_req[r]): + block_table[r, b] = block_idx + block_idx += 1 + + # Output workspace + dst = torch.zeros(total_tokens, HEAD_DIM, dtype=torch.bfloat16, device="cuda") + + seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device="cuda") + workspace_starts_t = torch.tensor( + workspace_starts, dtype=torch.int32, device="cuda" + ) + + return cache, dst, block_table, seq_lens_t, workspace_starts_t + + +def bench_scenario(label, num_reqs, total_tokens_list, save_path): + """Run benchmark for a specific (num_reqs, total_tokens) scenario.""" 
+ + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["total_tokens"], + x_vals=total_tokens_list, + line_arg="provider", + line_vals=["cuda_kernel"], + line_names=["cp_gather_fp8 (CUDA)"], + styles=[("green", "-")], + ylabel="Latency (us)", + plot_name=f"cp_gather_fp8-{label}-bs{BLOCK_SIZE}", + args={"num_reqs": num_reqs}, + ) + ) + def bench_fn(total_tokens, provider, num_reqs): + cache, dst, block_table, seq_lens_t, ws_starts = make_inputs( + total_tokens, num_reqs, BLOCK_SIZE + ) + + quantiles = [0.5, 0.2, 0.8] + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: ops.cp_gather_and_upconvert_fp8_kv_cache( + cache, dst, block_table, seq_lens_t, ws_starts, num_reqs + ), + quantiles=quantiles, + rep=500, + ) + + return ms * 1000, max_ms * 1000, min_ms * 1000 # us + + seq_len_per_req = total_tokens_list[0] // num_reqs + seq_len_per_req_max = total_tokens_list[-1] // num_reqs + print( + f"\n--- {label}: {num_reqs} request(s), " + f"~{seq_len_per_req}-{seq_len_per_req_max} tokens/req ---" + ) + bench_fn.run(print_data=True, save_path=save_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark cp_gather_and_upconvert_fp8_kv_cache" + ) + parser.add_argument( + "--save-path", + type=str, + default=None, + help="Path to save benchmark results as CSV", + ) + args = parser.parse_args() + + # Print data volume info for bandwidth analysis + read_per_token = ENTRY_BYTES # 656 bytes from cache + write_per_token = HEAD_DIM * 2 # 576 * 2 = 1152 bytes to workspace + total_per_token = read_per_token + write_per_token # 1808 bytes + + print("\n" + "=" * 70) + print("CP_GATHER_AND_UPCONVERT_FP8_KV_CACHE BENCHMARKS") + print("=" * 70) + print(f"Cache entry: {ENTRY_BYTES} bytes (512 FP8 + 16 scales + 128 RoPE)") + print(f"Output row: {HEAD_DIM} BF16 = {HEAD_DIM * 2} bytes") + print(f"Per token: {total_per_token} bytes (read + write)") + print(f"Block size: {BLOCK_SIZE} tokens/block") + print("=" * 70) + + 
for label, num_reqs, total_tokens_list in SCENARIOS: + bench_scenario(label, num_reqs, total_tokens_list, args.save_path) + + print("\n" + "=" * 70) + print("Benchmarking complete!") + print("=" * 70) diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py index 04921dafbdbea0a1b581e6210ba0560dcc603316..0dd5c6d848824b45d61bc0ba4ab134e495e61fcd 100644 --- a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py @@ -168,7 +168,7 @@ def bench_impl( # warmup for kwargs in kwargs_list: impl_type.get_impl()(**kwargs) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Merge into a single kwargs and qualify arguments as ArgPool kwargs = {k: ArgPool([]) for k in kwargs_list[0]} @@ -202,7 +202,7 @@ def test_correctness(T: int, N: int): # reference output ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE) - # test ouptut + # test output out_q, out_s = output_from_impl( ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR ) diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py index bb66e5d088ef7e40670758e9bf8747a4318d9e1e..e1cec02b7cad727ca8125beb61b80b5175fc54e3 100644 --- a/benchmarks/kernels/benchmark_activation.py +++ b/benchmarks/kernels/benchmark_activation.py @@ -7,6 +7,7 @@ import itertools import torch import vllm.model_executor.layers.activation # noqa F401 +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.custom_op import op_registry from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -18,6 +19,7 @@ intermediate_size = [3072, 9728, 12288] configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) +@default_vllm_config() def benchmark_activation( batch_size: int, seq_len: int, diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py 
b/benchmarks/kernels/benchmark_block_fp8_gemm.py similarity index 98% rename from benchmarks/kernels/bench_block_fp8_gemm.py rename to benchmarks/kernels/benchmark_block_fp8_gemm.py index 11e3ac7f0c1fa6ab4e576ee872a79a0129adc13f..8d50c3828206dfed74f3f95cc4a517e96f5e3b56 100644 --- a/benchmarks/kernels/bench_block_fp8_gemm.py +++ b/benchmarks/kernels/benchmark_block_fp8_gemm.py @@ -8,6 +8,7 @@ os.environ["VLLM_USE_DEEP_GEMM"] = "0" import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.quantization.utils.fp8_utils import ( W8A8BlockFp8LinearOp, ) @@ -40,6 +41,7 @@ DEEPSEEK_V3_SHAPES = [ ] +@default_vllm_config() def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): """Build runner function for w8a8 block fp8 matmul.""" factor_for_scale = 1e-2 diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index f1234d8213471621353d241255008bb610d11a9c..3f80b024e1081cc3986f39df7580733971561ada 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -11,12 +11,13 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from tests.kernels.moe.utils import make_dummy_moe_config from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.activation import MoEActivation +from vllm.model_executor.layers.fused_moe.all2all_utils import ( + maybe_make_prepare_finalize, +) from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk -from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP, -) from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.worker.workspace import 
init_workspace_manager @@ -63,7 +64,7 @@ def bench_run( per_out_ch: bool, mkn: tuple[int, int, int], ): - init_workspace_manager(torch.cuda.current_device()) + init_workspace_manager(torch.accelerator.current_device_index()) (m, k, n) = mkn dtype = torch.half @@ -136,15 +137,21 @@ def bench_run( per_out_ch_quant=per_out_ch, ) - fn = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), + moe_config = make_dummy_moe_config( + num_experts=num_experts, + hidden_dim=k, + intermediate_size_per_partition=n, + in_dtype=a.dtype, + ) + fn = mk.FusedMoEKernel( + maybe_make_prepare_finalize( + moe=moe_config, + quant_config=quant_config, + allow_new_interface=True, + use_monolithic=False, + ), CutlassExpertsFp8( - moe_config=make_dummy_moe_config( - num_experts=num_experts, - hidden_dim=k, - intermediate_size_per_partition=n, - in_dtype=a.dtype, - ), + moe_config=moe_config, quant_config=quant_config, ), ) @@ -161,10 +168,10 @@ def bench_run( w2_fp8q_cutlass, topk_weights, topk_ids, - activation="silu", + activation=MoEActivation.SILU, global_num_experts=num_experts, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly) triton_stream = torch.cuda.Stream() @@ -180,14 +187,14 @@ def bench_run( topk_ids, quant_config=quant_config, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() def bench_cuda_graph(graph, num_warmup=5, num_iters=100): """Benchmark CUDA graph using events like benchmark_moe.py""" # Warmup for _ in range(num_warmup): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Timing start_event = torch.Event(enable_timing=True) @@ -195,7 +202,7 @@ def bench_run( latencies = [] for _ in range(num_iters): - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() graph.replay() end_event.record() diff --git a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py index 
64b086ea221be696863fa35be65448275d9ac046..49ba2b0c9a64889d6a747d9d6e329acccb8810c3 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py @@ -15,6 +15,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from tests.kernels.moe.utils import make_dummy_moe_config from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.all2all_utils import ( + maybe_make_prepare_finalize, +) from vllm.model_executor.layers.fused_moe.config import ( fp8_w8a8_moe_quant_config, nvfp4_moe_quant_config, @@ -23,9 +26,6 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import ( CutlassExpertsFp4, ) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk -from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP, -) from vllm.scalar_type import scalar_types from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.worker.workspace import init_workspace_manager @@ -196,10 +196,21 @@ def bench_run( g2_alphas=w2_gs, ) - kernel = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(defer_input_quant=True), + moe_config = make_dummy_moe_config( + num_experts=num_experts, + hidden_dim=k, + intermediate_size_per_partition=n, + in_dtype=a.dtype, + ) + kernel = mk.FusedMoEKernel( + maybe_make_prepare_finalize( + moe=moe_config, + quant_config=quant_config, + allow_new_interface=True, + use_monolithic=False, + ), CutlassExpertsFp4( - make_dummy_moe_config(), + moe_config=moe_config, quant_config=quant_config, ), ) @@ -240,11 +251,17 @@ def bench_run( g1_alphas=w1_gs, g2_alphas=w2_gs, ) + moe_config = make_dummy_moe_config() - kernel = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(defer_input_quant=True), + kernel = mk.FusedMoEKernel( + maybe_make_prepare_finalize( + moe=moe_config, + quant_config=quant_config, + 
allow_new_interface=True, + use_monolithic=False, + ), CutlassExpertsFp4( - make_dummy_moe_config(), + moe_config=moe_config, quant_config=quant_config, ), ) @@ -290,7 +307,7 @@ def bench_run( def replay_graph(graph, num_repeats): for _ in range(num_repeats): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() cutlass_stream = torch.cuda.Stream() cutlass_graph = torch.cuda.CUDAGraph() @@ -313,7 +330,7 @@ def bench_run( e=num_experts, device=device, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() triton_stream = torch.cuda.Stream() triton_graph = torch.cuda.CUDAGraph() @@ -328,7 +345,7 @@ def bench_run( w2_fp8scale, a_fp8_scale, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() min_run_time = 5 num_warmup = 5 diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index 7b453fe7b6809957fabd9bfb772ecec98ee55999..24e22023b91d1e8b4c599af2fc9b452ba3fc7203 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -30,6 +30,9 @@ import torch.distributed as dist from torch.distributed import ProcessGroup from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce +from vllm.distributed.device_communicators.flashinfer_all_reduce import ( + FlashInferAllReduce, +) from vllm.distributed.device_communicators.pynccl import ( PyNcclCommunicator, register_nccl_symmetric_ops, @@ -44,7 +47,7 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) # Default sequence lengths to benchmark -DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192] +DEFAULT_SEQUENCE_LENGTHS = [16, 64, 128, 512, 1024, 2048, 4096, 8192] # Fixed hidden size and dtype for all benchmarks HIDDEN_SIZE = 8192 @@ -81,6 +84,7 @@ class CommunicatorBenchmark: self.symm_mem_comm = None self.symm_mem_comm_multimem = None self.symm_mem_comm_two_shot = None + 
self.fi_ar_comm = None self._init_communicators() @@ -161,6 +165,22 @@ class CommunicatorBenchmark: ) self.symm_mem_comm_two_shot = None + try: + self.fi_ar_comm = FlashInferAllReduce( + group=self.cpu_group, + device=self.device, + ) + if not self.fi_ar_comm.disabled: + logger.info("Rank %s: FlashInferAllReduce initialized", self.rank) + else: + logger.info("Rank %s: FlashInferAllReduce disabled", self.rank) + self.fi_ar_comm = None + except Exception as e: + logger.warning( + "Rank %s: Failed to initialize FlashInferAllReduce: %s", self.rank, e + ) + self.fi_ar_comm = None + def benchmark_allreduce( self, sequence_length: int, num_warmup: int, num_trials: int ) -> dict[str, float]: @@ -180,7 +200,8 @@ class CommunicatorBenchmark: lambda t, c=comm: c.custom_all_reduce(t), lambda t, c=comm: c.should_custom_ar(t), comm.capture(), - "1stage", # env variable value + {"VLLM_CUSTOM_ALLREDUCE_ALGO": "1stage"}, + None, # no destroy function ) ) # CustomAllreduce two-shot @@ -190,7 +211,8 @@ class CommunicatorBenchmark: lambda t, c=comm: c.custom_all_reduce(t), lambda t, c=comm: c.should_custom_ar(t), comm.capture(), - "2stage", # env variable value + {"VLLM_CUSTOM_ALLREDUCE_ALGO": "2stage"}, + None, # no destroy function ) ) @@ -202,7 +224,8 @@ class CommunicatorBenchmark: lambda t, c=comm: c.all_reduce(t), lambda t: True, # Always available if initialized nullcontext(), - None, # no env variable needed + {}, # no env variable needed + None, # no destroy function ) ) communicators.append( @@ -211,7 +234,8 @@ class CommunicatorBenchmark: lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t), lambda t: True, # Always available if initialized nullcontext(), - None, # no env variable needed + {}, # no env variable needed + None, # no destroy function ) ) @@ -223,7 +247,8 @@ class CommunicatorBenchmark: lambda t, c=comm: c.all_reduce(t), lambda t, c=comm: c.should_use_symm_mem(t), nullcontext(), - None, # no env variable needed + {}, # no env variable needed + None, # no 
destroy function ) ) @@ -235,29 +260,67 @@ class CommunicatorBenchmark: lambda t, c=comm: c.all_reduce(t), lambda t, c=comm: c.should_use_symm_mem(t), nullcontext(), - None, # no env variable needed + {}, # no env variable needed + None, # no destroy function needed ) ) - # Benchmark each communicator - for name, allreduce_fn, should_use_fn, context, env_var in communicators: - # Set environment variable if needed - if env_var is not None: - os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var - else: - # Clear the environment variable to avoid interference - os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None) - - latency = self.benchmark_allreduce_single( - sequence_length, - allreduce_fn, - should_use_fn, - context, - num_warmup, - num_trials, + if self.fi_ar_comm is not None: + comm = self.fi_ar_comm + communicators.append( + ( + "flashinfer_trtllm", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_fi_ar(t), + nullcontext(), + {"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "trtllm"}, + lambda c=comm: c.destroy(), + ) ) - if latency is not None: - results[name] = latency + communicators.append( + ( + "flashinfer_mnnvl", + lambda t, c=comm: c.all_reduce(t), + lambda t, c=comm: c.should_use_fi_ar(t), + nullcontext(), + {"VLLM_FLASHINFER_ALLREDUCE_BACKEND": "mnnvl"}, + lambda c=comm: c.destroy(), + ) + ) + + # Benchmark each communicator + for ( + name, + allreduce_fn, + should_use_fn, + context, + env_dict, + destroy_fn, + ) in communicators: + # Save original values and apply new environment variables + saved_env = {key: os.environ.get(key) for key in env_dict} + for key, value in env_dict.items(): + os.environ[key] = value + try: + latency = self.benchmark_allreduce_single( + sequence_length, + allreduce_fn, + should_use_fn, + context, + num_warmup, + num_trials, + ) + if latency is not None: + results[name] = latency + finally: + if destroy_fn is not None: + destroy_fn() + # Restore environment variables to their original state + for key, original_value 
in saved_env.items(): + if original_value is None: + os.environ.pop(key, None) + else: + os.environ[key] = original_value return results @@ -279,7 +342,7 @@ class CommunicatorBenchmark: if not should_use_fn(tensor): return None - torch.cuda.synchronize() + torch.accelerator.synchronize() stream = torch.cuda.Stream() with torch.cuda.stream(stream): graph_input = tensor.clone() @@ -297,17 +360,17 @@ class CommunicatorBenchmark: for _ in range(CUDA_GRAPH_CAPTURE_CYCLES): allreduce_fn(graph_input) - torch.cuda.synchronize() + torch.accelerator.synchronize() for _ in range(num_warmup): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.perf_counter() for _ in range(num_trials): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() @@ -432,7 +495,7 @@ def main(): # Set device device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Get CPU process group cpu_group = dist.new_group(backend="gloo") diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/benchmark_fp8_gemm.py similarity index 100% rename from benchmarks/kernels/bench_fp8_gemm.py rename to benchmarks/kernels/benchmark_fp8_gemm.py diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py index 38e7fdcf55426f1e50424a2715e1f8323dcff729..05b842d7ee914e526a1e2ef739488cb50b023844 100644 --- a/benchmarks/kernels/benchmark_fused_collective.py +++ b/benchmarks/kernels/benchmark_fused_collective.py @@ -5,8 +5,11 @@ Benchmark for FlashInfer fused collective operations vs standard operations. This benchmark compares: -1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant) -2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations +1. 
FlashInfer's allreduce_fusion with trtllm backend + (fused allreduce + rmsnorm + optional FP8/FP4 quant) +2. FlashInfer's allreduce_fusion with mnnvl backend + (fused allreduce + rmsnorm only, no quantization support) +3. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations Usage with torchrun: torchrun --nproc_per_node=2 benchmark_fused_collective.py @@ -24,7 +27,6 @@ import torch.distributed as dist # type: ignore from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.distributed import ( - get_tp_group, tensor_model_parallel_all_reduce, ) from vllm.distributed.parallel_state import ( @@ -49,14 +51,19 @@ SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant logger = init_logger(__name__) # Try to import FlashInfer +TorchDistBackend = None try: import flashinfer.comm as flashinfer_comm # type: ignore + from flashinfer.comm.mnnvl import ( # type: ignore + TorchDistBackend, + ) - if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"): + if not ( + hasattr(flashinfer_comm, "allreduce_fusion") + and hasattr(flashinfer_comm, "create_allreduce_fusion_workspace") + ): flashinfer_comm = None - logger.warning( - "FlashInfer comm module found but missing trtllm_allreduce_fusion" - ) + logger.warning("FlashInfer comm module found but missing allreduce_fusion API") except ImportError: flashinfer_comm = None logger.warning("FlashInfer not found, only benchmarking standard operations") @@ -74,57 +81,70 @@ _FI_MAX_SIZES = { 8: 64 * MiB, # 64MB } -# Global workspace tensor for FlashInfer -_FI_WORKSPACE_TENSOR = None +# Global workspace tensors for FlashInfer (keyed by backend name) +_FI_WORKSPACES: dict = {} + +# Backends to benchmark +FLASHINFER_BACKENDS = ["trtllm", "mnnvl"] def setup_flashinfer_workspace( + backend: str, world_size: int, rank: int, hidden_dim: int, max_token_num: int, - use_fp32_lamport: bool = False, + dtype: torch.dtype, ): """Setup FlashInfer workspace for fused allreduce operations.""" - 
global _FI_WORKSPACE_TENSOR + global FI_WORKSPACES if flashinfer_comm is None: - return None, None + return None if world_size not in _FI_MAX_SIZES: logger.warning("FlashInfer not supported for world size %s", world_size) - return None, None + return None try: - # Create IPC workspace - ipc_handles, workspace_tensor = ( - flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion( - tp_rank=rank, - tp_size=world_size, - max_token_num=max_token_num, - hidden_dim=hidden_dim, - group=get_tp_group().device_group, - use_fp32_lamport=use_fp32_lamport, - ) + kwargs = {} + if TorchDistBackend is not None: + kwargs["comm_backend"] = TorchDistBackend(group=dist.group.WORLD) + + workspace = flashinfer_comm.create_allreduce_fusion_workspace( + backend=backend, + world_size=world_size, + rank=rank, + max_token_num=max_token_num, + hidden_dim=hidden_dim, + dtype=dtype, + **kwargs, ) - _FI_WORKSPACE_TENSOR = workspace_tensor - return ipc_handles, workspace_tensor + _FI_WORKSPACES[backend] = workspace + return workspace except Exception as e: - logger.error("Failed to setup FlashInfer workspace: %s", e) - return None, None + logger.error( + "Failed to setup FlashInfer workspace (backend=%s): %s", backend, e + ) + return None -def cleanup_flashinfer_workspace(ipc_handles): - """Cleanup FlashInfer workspace.""" - if flashinfer_comm is None or ipc_handles is None: +def cleanup_flashinfer_workspaces(): + """Cleanup all FlashInfer workspaces.""" + if flashinfer_comm is None: return - try: - group = get_tp_group().device_group - flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group) - except Exception as e: - logger.error("Failed to cleanup FlashInfer workspace: %s", e) + for backend, workspace in _FI_WORKSPACES.items(): + try: + workspace.destroy() + except Exception as e: + logger.error( + "Failed to cleanup FlashInfer workspace (backend=%s): %s", + backend, + e, + ) + _FI_WORKSPACES.clear() class FlashInferFusedAllReduceParams: @@ -132,25 +152,15 @@ 
class FlashInferFusedAllReduceParams: def __init__( self, - rank: int, - world_size: int, - use_fp32_lamport: bool = False, max_token_num: int = 1024, ): - self.rank = rank - self.world_size = world_size - self.use_fp32_lamport = use_fp32_lamport - self.trigger_completion_at_end = True self.launch_with_pdl = True self.fp32_acc = True self.max_token_num = max_token_num - def get_trtllm_fused_allreduce_kwargs(self): + def get_flashinfer_fused_allreduce_kwargs(self): return { - "world_rank": self.rank, - "world_size": self.world_size, "launch_with_pdl": self.launch_with_pdl, - "trigger_completion_at_end": self.trigger_completion_at_end, "fp32_acc": self.fp32_acc, } @@ -161,11 +171,12 @@ def flashinfer_fused_allreduce_rmsnorm( rms_gamma: torch.Tensor, rms_eps: float, allreduce_params: "FlashInferFusedAllReduceParams", + workspace: object, use_oneshot: bool, norm_out: torch.Tensor | None = None, ): """FlashInfer fused allreduce + rmsnorm operation.""" - if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + if flashinfer_comm is None or workspace is None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -174,24 +185,25 @@ def flashinfer_fused_allreduce_rmsnorm( else: residual_out = input_tensor - flashinfer_comm.trtllm_allreduce_fusion( - allreduce_in=input_tensor, - token_num=input_tensor.shape[0], + layout_code = None + if workspace.backend == "trtllm": + layout_code = flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4 + + flashinfer_comm.allreduce_fusion( + input=input_tensor, + workspace=workspace, + pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm, residual_in=residual, residual_out=residual_out, norm_out=norm_out, rms_gamma=rms_gamma, rms_eps=rms_eps, - hidden_dim=input_tensor.shape[-1], - workspace_ptrs=_FI_WORKSPACE_TENSOR, - pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm, - allreduce_out=None, quant_out=None, scale_out=None, - 
layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, + layout_code=layout_code, scale_factor=None, use_oneshot=use_oneshot, - **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + **allreduce_params.get_flashinfer_fused_allreduce_kwargs(), ) @@ -202,12 +214,16 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant( rms_eps: float, scale_factor: torch.Tensor, allreduce_params: FlashInferFusedAllReduceParams, + workspace: object, use_oneshot: bool = True, norm_out: torch.Tensor | None = None, quant_out: torch.Tensor | None = None, ): - """FlashInfer fused allreduce + rmsnorm + FP8 quantization.""" - if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + """FlashInfer fused allreduce + rmsnorm + FP8 quantization. + + Note: Only supported by the trtllm backend. + """ + if flashinfer_comm is None or workspace is None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -216,24 +232,21 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant( else: residual_out = input_tensor - flashinfer_comm.trtllm_allreduce_fusion( - allreduce_in=input_tensor, - token_num=input_tensor.shape[0], + flashinfer_comm.allreduce_fusion( + input=input_tensor, + workspace=workspace, + pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, residual_in=residual, residual_out=residual_out, norm_out=norm_out, rms_gamma=rms_gamma, rms_eps=rms_eps, - hidden_dim=input_tensor.shape[-1], - workspace_ptrs=_FI_WORKSPACE_TENSOR, - pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, - allreduce_out=None, quant_out=quant_out, scale_out=None, layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, scale_factor=scale_factor, use_oneshot=use_oneshot, - **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + **allreduce_params.get_flashinfer_fused_allreduce_kwargs(), ) @@ -244,13 +257,17 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant( rms_eps: float, input_global_scale: torch.Tensor, 
allreduce_params: FlashInferFusedAllReduceParams, + workspace: object, quant_out: torch.Tensor, use_oneshot: bool, output_scale: torch.Tensor, norm_out: torch.Tensor | None = None, ): - """FlashInfer fused allreduce + rmsnorm + FP4 quantization.""" - if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + """FlashInfer fused allreduce + rmsnorm + FP4 quantization. + + Note: Only supported by the trtllm backend. + """ + if flashinfer_comm is None or workspace is None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -259,24 +276,21 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant( else: residual_out = input_tensor - flashinfer_comm.trtllm_allreduce_fusion( - allreduce_in=input_tensor, - token_num=input_tensor.shape[0], + flashinfer_comm.allreduce_fusion( + input=input_tensor, + workspace=workspace, + pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, residual_in=residual, residual_out=residual_out, norm_out=norm_out, rms_gamma=rms_gamma, rms_eps=rms_eps, - hidden_dim=input_tensor.shape[-1], - workspace_ptrs=_FI_WORKSPACE_TENSOR, - pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, - allreduce_out=None, quant_out=quant_out, scale_out=output_scale, layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, scale_factor=input_global_scale, use_oneshot=use_oneshot, - **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + **allreduce_params.get_flashinfer_fused_allreduce_kwargs(), ) @@ -371,32 +385,32 @@ def benchmark_operation( # Warmup before graph capture for _ in range(warmup): operation_func(*args, **kwargs) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Create CUDA graph graph = torch.cuda.CUDAGraph() num_op_per_cudagraph = 10 # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe - device = torch.device(f"cuda:{torch.cuda.current_device()}") + device = 
torch.device(f"cuda:{torch.accelerator.current_device_index()}") with graph_capture(device=device), torch.cuda.graph(graph): for _ in range(num_op_per_cudagraph): operation_func(*args, **kwargs) # Graph warmup - torch.cuda.synchronize() + torch.accelerator.synchronize() for _ in range(warmup): graph.replay() # Benchmark with CUDA graph - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.perf_counter() for _ in range(trials // num_op_per_cudagraph): # operation_func(*args, **kwargs) graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() avg_time_ms = ((end_time - start_time) / trials) * 1000 @@ -409,13 +423,16 @@ def run_benchmarks( dtype: torch.dtype, use_residual: bool, allreduce_params: FlashInferFusedAllReduceParams | None, + workspaces: dict, quant_modes: set[str], no_oneshot: bool, ): """Run all benchmarks for given configuration. Args: - quant_mode: "none", "fp8_only", "fp4_only", or "all" + allreduce_params: Shared parameters for FlashInfer fused allreduce. + workspaces: Dict mapping backend name ("trtllm", "mnnvl") to workspace. + quant_modes: Set of quantization modes: "none", "fp8", "fp4". """ ( input_tensor, @@ -431,18 +448,18 @@ def run_benchmarks( rms_eps = 1e-6 results = {} - vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype) use_oneshot_options = [False] if no_oneshot else [True, False] - # Create RMSNorm and QuantFP8 layers once for native benchmarks - if "none" in quant_modes: # Standard AllReduce + RMSNorm + # Re-create VllmFusedAllreduce per config so CustomOp binds the + # correct forward method (native vs custom kernel). 
for custom_op in ["-rms_norm", "+rms_norm"]: with set_current_vllm_config( VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op])) ): try: + vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype) suffix = ( "_custom_rms_norm" if "+" in custom_op else "_native_rms_norm" ) @@ -461,6 +478,7 @@ def run_benchmarks( VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"])) ): try: + vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype) standard_allreduce_rmsnorm_native_compiled = torch.compile( vllm_fused_allreduce.allreduce_rmsnorm, fullgraph=True, @@ -476,10 +494,11 @@ def run_benchmarks( logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e) results["standard_allreduce_rmsnorm_native_compiled"] = float("inf") - # FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot - if flashinfer_comm is not None and allreduce_params is not None: + # FlashInfer Fused AllReduce + RMSNorm (all backends) + for backend, workspace in workspaces.items(): for use_oneshot in use_oneshot_options: suffix = "_oneshot" if use_oneshot else "_twoshot" + key = f"flashinfer_{backend}_fused_allreduce_rmsnorm{suffix}" try: time_ms = benchmark_operation( flashinfer_fused_allreduce_rmsnorm, @@ -489,14 +508,17 @@ def run_benchmarks( rms_gamma=rms_gamma, rms_eps=rms_eps, allreduce_params=allreduce_params, + workspace=workspace, use_oneshot=use_oneshot, ) - results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms + results[key] = time_ms except Exception as e: - logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e) - results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float( - "inf" + logger.error( + "FlashInfer (%s) Fused AllReduce+RMSNorm failed: %s", + backend, + e, ) + results[key] = float("inf") if "fp8" in quant_modes: # Standard AllReduce + RMSNorm + FP8 Quant @@ -505,7 +527,7 @@ def run_benchmarks( "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm" ) for quant_fp8_custom_op in 
["-quant_fp8", "+quant_fp8"]: - suffix += ( + op_suffix = suffix + ( "_custom_quant_fp8" if "+" in quant_fp8_custom_op else "_native_quant_fp8" @@ -518,16 +540,17 @@ def run_benchmarks( ) ): try: + vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype) time_ms = benchmark_operation( vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant, input_tensor, residual=residual, scale_factor=scale_fp8, ) - results[f"standard_allreduce{suffix}"] = time_ms + results[f"standard_allreduce{op_suffix}"] = time_ms except Exception as e: logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e) - results[f"standard_allreduce{suffix}"] = float("inf") + results[f"standard_allreduce{op_suffix}"] = float("inf") # Standard AllReduce + RMSNorm + FP8 Quant Native Compiled with set_current_vllm_config( @@ -538,6 +561,7 @@ def run_benchmarks( ) ): try: + vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype) standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile( vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant, fullgraph=True, @@ -560,10 +584,12 @@ def run_benchmarks( "inf" ) - # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot - if flashinfer_comm is not None and allreduce_params is not None: + # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant (trtllm only) + if "trtllm" in workspaces: + trtllm_ws = workspaces["trtllm"] for use_oneshot in use_oneshot_options: suffix = "_oneshot" if use_oneshot else "_twoshot" + key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp8_quant{suffix}" try: time_ms = benchmark_operation( flashinfer_fused_allreduce_rmsnorm_fp8_quant, @@ -575,19 +601,16 @@ def run_benchmarks( scale_factor=scale_fp8, quant_out=quant_out_fp8, allreduce_params=allreduce_params, + workspace=trtllm_ws, use_oneshot=use_oneshot, ) - results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = ( - time_ms - ) + results[key] = time_ms except Exception as e: logger.error( - "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s", + "FlashInfer 
(trtllm) Fused AllReduce+RMSNorm+FP8 failed: %s", e, ) - results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = ( - float("inf") - ) + results[key] = float("inf") if "fp4" in quant_modes and current_platform.has_device_capability(100): # Standard AllReduce + RMSNorm + FP4 Quant @@ -603,6 +626,7 @@ def run_benchmarks( ) ): try: + vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype) time_ms = benchmark_operation( vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant, input_tensor, @@ -621,6 +645,7 @@ def run_benchmarks( VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"])) ): try: + vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype) standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile( vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant, fullgraph=True, @@ -645,10 +670,12 @@ def run_benchmarks( "inf" ) - # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot - if flashinfer_comm is not None and allreduce_params is not None: + # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant (trtllm only) + if "trtllm" in workspaces: + trtllm_ws = workspaces["trtllm"] for use_oneshot in use_oneshot_options: suffix = "_oneshot" if use_oneshot else "_twoshot" + key = f"flashinfer_trtllm_fused_allreduce_rmsnorm_fp4_quant{suffix}" try: time_ms = benchmark_operation( flashinfer_fused_allreduce_rmsnorm_fp4_quant, @@ -659,49 +686,18 @@ def run_benchmarks( rms_eps=rms_eps, input_global_scale=scale_fp4, allreduce_params=allreduce_params, + workspace=trtllm_ws, quant_out=fp4_quant_out, output_scale=fp4_output_scale, use_oneshot=use_oneshot, ) - results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = ( - time_ms - ) + results[key] = time_ms except Exception as e: logger.error( - "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s", + "FlashInfer (trtllm) Fused AllReduce+RMSNorm+FP4 failed: %s", e, ) - results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = ( - float("inf") - ) - - # FlashInfer Fused 
AllReduce + RMSNorm + FP4 Quant Two-shot - if flashinfer_comm is not None and allreduce_params is not None: - try: - time_ms = benchmark_operation( - flashinfer_fused_allreduce_rmsnorm_fp4_quant, - input_tensor, - residual=residual, - norm_out=norm_out, - rms_gamma=rms_gamma, - rms_eps=rms_eps, - input_global_scale=scale_fp4, - allreduce_params=allreduce_params, - quant_out=fp4_quant_out, - output_scale=fp4_output_scale, - use_oneshot=False, - ) - results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = ( - time_ms - ) - except Exception as e: - logger.error( - "FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s", - e, - ) - results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float( - "inf" - ) + results[key] = float("inf") return results @@ -988,7 +984,7 @@ def main(): world_size = int(os.environ["WORLD_SIZE"]) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) init_distributed_environment() @@ -1039,24 +1035,33 @@ def main(): configs = list(itertools.product(args.num_tokens, dtypes, residual_options)) - # Setup FlashInfer workspace if available - ipc_handles = None + # Setup FlashInfer workspaces for all backends allreduce_params = None if flashinfer_comm is not None: # Use the largest hidden dimension for workspace setup + max_element_size = max(torch.finfo(dt).bits // 8 for dt in dtypes) + workspace_dtype = ( + torch.float32 + if max_element_size == 4 + else (torch.bfloat16 if torch.bfloat16 in dtypes else torch.float16) + ) max_num_token = _FI_MAX_SIZES.get(world_size) // ( - args.hidden_dim * world_size * 2 + args.hidden_dim * max_element_size ) - ipc_handles, workspace_tensor = setup_flashinfer_workspace( - world_size, rank, args.hidden_dim, max_num_token - ) + for backend in FLASHINFER_BACKENDS: + setup_flashinfer_workspace( + backend=backend, + world_size=world_size, + rank=rank, + hidden_dim=args.hidden_dim, + 
max_token_num=max_num_token, + dtype=workspace_dtype, + ) - if workspace_tensor is not None: + if _FI_WORKSPACES: allreduce_params = FlashInferFusedAllReduceParams( - rank=rank, - world_size=world_size, max_token_num=max_num_token, ) @@ -1081,6 +1086,7 @@ def main(): dtype, use_residual, allreduce_params, + workspaces=_FI_WORKSPACES, quant_modes=quant_modes, no_oneshot=args.no_oneshot, ) @@ -1119,11 +1125,13 @@ def main(): finally: # Cleanup - if ipc_handles is not None: - cleanup_flashinfer_workspace(ipc_handles) + cleanup_flashinfer_workspaces() dist.barrier() if __name__ == "__main__": - main() + from vllm.config import VllmConfig, set_current_vllm_config + + with set_current_vllm_config(VllmConfig()): + main() diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 7b5daa62eb34a01b3dd07829f9074202f7e5680a..dd4060bbdb940b5a45eae8dd71d34b051c408a72 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -9,15 +9,15 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from tests.kernels.moe.utils import make_dummy_moe_config from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.all2all_utils import ( + maybe_make_prepare_finalize, +) from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, fused_topk, ) -from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP, -) from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.worker.workspace import init_workspace_manager @@ -50,7 +50,7 @@ def bench_run( per_out_ch: bool, mkn: tuple[int, int, int], ): - 
init_workspace_manager(torch.cuda.current_device()) + init_workspace_manager(torch.accelerator.current_device_index()) label = "Quant Matmul" sub_label = ( @@ -131,16 +131,22 @@ def bench_run( w2_scale=w2_scale, per_act_token_quant=per_act_token, ) + moe_config = make_dummy_moe_config( + num_experts=w2.shape[0], + hidden_dim=w2.shape[1], + intermediate_size_per_partition=w2.shape[2], + in_dtype=a.dtype, + ) - fn = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), + fn = mk.FusedMoEKernel( + maybe_make_prepare_finalize( + moe=moe_config, + quant_config=quant_config, + allow_new_interface=True, + use_monolithic=False, + ), CutlassExpertsFp8( - moe_config=make_dummy_moe_config( - num_experts=w2.shape[0], - hidden_dim=w2.shape[1], - intermediate_size_per_partition=w2.shape[2], - in_dtype=a.dtype, - ), + moe_config=moe_config, quant_config=quant_config, ), ) @@ -163,16 +169,22 @@ def bench_run( w2_scale=w2_scale, per_act_token_quant=per_act_token, ) + moe_config = make_dummy_moe_config( + num_experts=w2.shape[0], + hidden_dim=w2.shape[1], + intermediate_size_per_partition=w2.shape[2], + in_dtype=a.dtype, + ) - fn = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), + fn = mk.FusedMoEKernel( + maybe_make_prepare_finalize( + moe=moe_config, + quant_config=quant_config, + allow_new_interface=True, + use_monolithic=False, + ), CutlassExpertsFp8( - moe_config=make_dummy_moe_config( - num_experts=w2.shape[0], - hidden_dim=w2.shape[1], - intermediate_size_per_partition=w2.shape[2], - in_dtype=a.dtype, - ), + moe_config=moe_config, quant_config=quant_config, ), ) @@ -212,7 +224,7 @@ def bench_run( def replay_graph(graph, num_repeats): for _ in range(num_repeats): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() cutlass_stream = torch.cuda.Stream() cutlass_graph = torch.cuda.CUDAGraph() @@ -227,7 +239,7 @@ def bench_run( topk_weights, topk_ids, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() triton_stream = 
torch.cuda.Stream() triton_graph = torch.cuda.CUDAGraph() @@ -242,7 +254,7 @@ def bench_run( w2_scale, a_scale, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() min_run_time = 5 num_warmup = 5 diff --git a/benchmarks/kernels/bench_int8_gemm.py b/benchmarks/kernels/benchmark_int8_gemm.py similarity index 100% rename from benchmarks/kernels/bench_int8_gemm.py rename to benchmarks/kernels/benchmark_int8_gemm.py diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 2292d2f87288f267082e7106e20270f6f8e17bbf..a662e3ac49cbada1a239b7f9f9b26d02ddb628c7 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -5,12 +5,14 @@ import time import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.layernorm import RMSNorm from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed @torch.inference_mode() +@default_vllm_config() def main( num_tokens: int, hidden_size: int, @@ -32,14 +34,14 @@ def main( residual = torch.randn_like(x) * scale if add_residual else None def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() + torch.accelerator.synchronize() if profile: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() for _ in range(num_iters): layer(x, residual) - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() if profile: diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 8ca3cf78f0fb22bee49becc5f4325398930a0c04..ab930c59d21937739089c4a4216ff09899ae7a1a 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1035,7 +1035,7 @@ def bench_optype( # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up for kwargs in kwargs_list: 
op_type.bench_fn()(**kwargs) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Merge into a single kwargs and qualify arguments as ArgPool kwargs = {k: ArgPool([]) for k in kwargs_list[0]} diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py index fb3b6c8f12003e0049dddd4d057c6c31a4aa5dfb..7debf3634804fbf06f76673c767b39ad209ad720 100644 --- a/benchmarks/kernels/benchmark_mla_k_concat.py +++ b/benchmarks/kernels/benchmark_mla_k_concat.py @@ -47,13 +47,13 @@ def benchmark_method( # Warmup for _ in range(num_warmup): _ = method(k_nope, k_pe) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Benchmark start = time.perf_counter() for _ in range(num_iters): _ = method(k_nope, k_pe) - torch.cuda.synchronize() + torch.accelerator.synchronize() end = time.perf_counter() return (end - start) / num_iters * 1000 # Convert to ms diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c35cdb121069a33bd10886ff0f143cf711904f5c..cf49232fd72d6662c9a3858539e4e4fe0eeda8f7 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -16,6 +16,10 @@ import torch from ray.experimental.tqdm_ray import tqdm from vllm.model_executor.layers.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation +from vllm.model_executor.layers.fused_moe.all2all_utils import ( + maybe_make_prepare_finalize, +) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -50,7 +54,7 @@ def clear_triton_cache(): # Clear CUDA memory cache if torch.cuda.is_available(): - torch.cuda.empty_cache() + torch.accelerator.empty_cache() # Try to clear Triton's runtime cache try: @@ -99,13 +103,38 @@ def benchmark_config( dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool = False, num_iters: int = 100, block_quant_shape: list[int] = None, use_deep_gemm: bool 
= False, ) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype x = torch.randn(num_tokens, hidden_size, dtype=dtype) - if use_int8_w8a16: + if use_int4_w4a16: + # Int4 packed weights: 2 int4 values per uint8 byte + # K dimension is packed (halved) + intermediate_size = shard_intermediate_size // 2 # after silu_and_mul + w1 = torch.randint( + 0, + 255, + ( + num_experts, + shard_intermediate_size, + hidden_size // 2, # int4 packing + ), + dtype=torch.uint8, + ) + w2 = torch.randint( + 0, + 255, + ( + num_experts, + hidden_size, + intermediate_size // 2, # int4 packing + ), + dtype=torch.uint8, + ) + elif use_int8_w8a16: w1 = torch.randint( -127, 127, @@ -139,7 +168,20 @@ def benchmark_config( w2_scale = None a1_scale = None a2_scale = None - if use_int8_w8a16: + if use_int4_w4a16: + if block_quant_shape is None: + raise ValueError("block_quant_shape is required for int4_w4a16") + group_size = block_quant_shape[1] + # Scales shape: (E, N, K // group_size) in fp16 + w1_scale = torch.rand( + (num_experts, shard_intermediate_size, hidden_size // group_size), + dtype=dtype, + ) + w2_scale = torch.rand( + (num_experts, hidden_size, intermediate_size // group_size), + dtype=dtype, + ) + elif use_int8_w8a16: w1_scale = torch.randn( (num_experts, 2 * shard_intermediate_size), dtype=torch.float32 ) @@ -198,27 +240,38 @@ def benchmark_config( a1_scale=a1_scale, a2_scale=a2_scale, block_shape=block_quant_shape, + weight_dtype="int4" if use_int4_w4a16 else None, ) deep_gemm_experts = None if use_deep_gemm: - deep_gemm_experts = mk.FusedMoEModularKernel( - prepare_finalize=MoEPrepareAndFinalizeNoEP(), + moe_config = ( + FusedMoEConfig( + num_experts=num_experts, + experts_per_token=topk, + hidden_dim=hidden_size, + intermediate_size_per_partition=shard_intermediate_size, + num_local_experts=num_experts, + num_logical_experts=num_experts, + activation=MoEActivation.SILU, + moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), + in_dtype=init_dtype, + 
routing_method=RoutingMethodType.TopK, + device="cuda", + ) + ) + deep_gemm_experts = mk.FusedMoEKernel( + prepare_finalize=maybe_make_prepare_finalize( + moe=moe_config, + quant_config=quant_config, + allow_new_interface=True, + use_monolithic=False, + ), fused_experts=TritonOrDeepGemmExperts( - moe_config=FusedMoEConfig( - num_experts=num_experts, - experts_per_token=topk, - hidden_dim=hidden_size, - intermediate_size_per_partition=shard_intermediate_size, - num_local_experts=num_experts, - activation="silu", - moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), - in_dtype=init_dtype, - routing_method=RoutingMethodType.TopK, - device="cuda", - ), + moe_config=moe_config, quant_config=quant_config, ), + inplace=not disable_inplace(), ) with override_config(config): @@ -226,9 +279,18 @@ def benchmark_config( x, input_gating, topk, renormalize=not use_deep_gemm ) + inplace = not disable_inplace() if use_deep_gemm: - return deep_gemm_experts( - x, w1, w2, topk_weights, topk_ids, inplace=True + return deep_gemm_experts.apply( + x, + w1, + w2, + topk_weights, + topk_ids, + activation=MoEActivation.SILU, + global_num_experts=num_experts, + apply_router_weight_on_input=False, + expert_map=False, ) return fused_experts( x, @@ -236,25 +298,25 @@ def benchmark_config( w2, topk_weights, topk_ids, - inplace=True, + inplace=inplace, quant_config=quant_config, ) # JIT compilation & warmup run() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Capture 10 invocations with CUDA graph graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): for _ in range(10): run() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Warmup for _ in range(5): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) @@ -262,7 +324,7 @@ def benchmark_config( latencies: list[float] = [] for i in range(num_iters): prepare(i) - torch.cuda.synchronize() +
torch.accelerator.synchronize() start_event.record() graph.replay() @@ -478,6 +540,7 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool = False, block_quant_shape: list[int] = None, use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: @@ -485,7 +548,10 @@ class BenchmarkWorker: set_random_seed(self.seed) dtype_str = _get_config_dtype_str( - dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + dtype, + use_int8_w8a16=use_int8_w8a16, + use_fp8_w8a8=use_fp8_w8a8, + use_int4_w4a16=use_int4_w4a16, ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. @@ -516,6 +582,7 @@ class BenchmarkWorker: dtype, use_fp8_w8a8, use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, num_iters=100, block_quant_shape=block_quant_shape, use_deep_gemm=use_deep_gemm, @@ -532,6 +599,7 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, search_space: list[dict[str, int]], block_quant_shape: list[int], use_deep_gemm: bool, @@ -542,7 +610,7 @@ class BenchmarkWorker: best_config = None best_time = float("inf") if current_platform.is_rocm(): - is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16) search_space = prune_rocm_search_space( num_tokens, shard_intermediate_size, @@ -558,7 +626,11 @@ class BenchmarkWorker: if visible_device != f"{self.device_id}": need_device_guard = True - with torch.cuda.device(self.device_id) if need_device_guard else nullcontext(): + with ( + torch.accelerator.device_index(self.device_id) + if need_device_guard + else nullcontext() + ): for idx, config in enumerate(tqdm(search_space)): try: kernel_time = benchmark_config( @@ -571,6 +643,7 @@ class BenchmarkWorker: dtype, use_fp8_w8a8, use_int8_w8a16, + use_int4_w4a16, num_iters=20, block_quant_shape=block_quant_shape, use_deep_gemm=use_deep_gemm, @@ -618,6 
+691,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: else {} ), **({"kpack": config["kpack"]} if "kpack" in config else {}), + **({"SPLIT_K": config["SPLIT_K"]} if "SPLIT_K" in config else {}), } @@ -630,11 +704,15 @@ def save_configs( dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + use_int4_w4a16: bool, block_quant_shape: list[int], save_dir: str, ) -> None: dtype_str = _get_config_dtype_str( - dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 + dtype, + use_int8_w8a16=use_int8_w8a16, + use_fp8_w8a8=use_fp8_w8a8, + use_int4_w4a16=use_int4_w4a16, ) # NOTE(woosuk): The current naming convention uses w2.shape[2], which @@ -736,6 +814,38 @@ def get_model_params(config): return E, topk, intermediate_size, hidden_size +def get_quantization_group_size(config) -> int | None: + """Extract the quantization group size from the HF model config. + + This reads directly from the HuggingFace config object (as returned by + ``get_config()``), not from vLLM's quantization config classes. + + Supports AWQ/GPTQ-style configs (direct 'group_size' key) and + compressed-tensors configs (nested inside 'config_groups'). 
+ """ + quantization_config = getattr(config, "quantization_config", {}) + if not isinstance(quantization_config, dict): + return None + # AWQ / GPTQ style: group_size is a top-level key + gs = quantization_config.get("group_size") + if gs is not None: + return gs + # compressed-tensors style: group_size is nested in config_groups + config_groups = quantization_config.get("config_groups", {}) + if not isinstance(config_groups, dict): + return None + for group_cfg in config_groups.values(): + if not isinstance(group_cfg, dict): + continue + weights = group_cfg.get("weights", {}) + if not isinstance(weights, dict): + continue + gs = weights.get("group_size") + if gs is not None: + return gs + return None + + def main(args: argparse.Namespace): print(args) @@ -754,7 +864,20 @@ def main(args: argparse.Namespace): dtype = torch.float16 if current_platform.is_rocm() else config.dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" + use_int4_w4a16 = args.dtype == "int4_w4a16" block_quant_shape = get_weight_block_size_safety(config) + if use_int4_w4a16: + group_size = get_quantization_group_size(config) + if group_size is None: + raise ValueError( + "Could not determine group_size from model config. " + "The model's quantization_config must contain a 'group_size' " + "field (AWQ/GPTQ) or 'config_groups.*.weights.group_size' " + "(compressed-tensors)." + ) + # For int4_w4a16, block_shape = [0, group_size] + # block_shape[0]=0 means no block quantization on N dimension + block_quant_shape = [0, group_size] if args.batch_size is None: batch_sizes = [ @@ -808,8 +931,20 @@ def main(args: argparse.Namespace): return ray.get(outputs) if args.tune: - is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) - search_space = get_configs_compute_bound(is_fp16, block_quant_shape) + # int4_w4a16 weights are uint8-packed, not fp16; treat like fp8 for + # search space generation (no matrix_instr_nonkdim/kpack exploration). 
+ is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16) + # For int4_w4a16, the group_size constraint on BLOCK_SIZE_K does not + # apply: the gptq_awq kernel handles arbitrary BLOCK_SIZE_K regardless + # of group_size. Skip block_quant_shape filtering to keep the full + # search space (e.g. BLOCK_SIZE_K=64 with group_size=128). + tune_block_quant_shape = None if use_int4_w4a16 else block_quant_shape + search_space = get_configs_compute_bound(is_fp16, tune_block_quant_shape) + if use_int4_w4a16: + # SPLIT_K is a required kernel constexpr for gptq_awq kernel; + # only SPLIT_K=1 is used at runtime, so fix it during tuning. + for cfg in search_space: + cfg["SPLIT_K"] = 1 print(f"Start tuning over {len(search_space)} configurations...") if use_deep_gemm: raise ValueError( @@ -829,6 +964,7 @@ def main(args: argparse.Namespace): dtype, use_fp8_w8a8, use_int8_w8a16, + use_int4_w4a16, search_space, block_quant_shape, use_deep_gemm, @@ -848,6 +984,7 @@ def main(args: argparse.Namespace): dtype, use_fp8_w8a8, use_int8_w8a16, + use_int4_w4a16, block_quant_shape, args.save_dir, ) @@ -866,6 +1003,7 @@ def main(args: argparse.Namespace): dtype, use_fp8_w8a8, use_int8_w8a16, + use_int4_w4a16, block_quant_shape, use_deep_gemm, ) @@ -888,7 +1026,10 @@ if __name__ == "__main__": ) parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true") parser.add_argument( - "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" + "--dtype", + type=str, + choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"], + default="auto", ) parser.add_argument("--use-deep-gemm", action="store_true") parser.add_argument( diff --git a/benchmarks/kernels/benchmark_moe_defaults.py b/benchmarks/kernels/benchmark_moe_defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..f6ad59366dca7bb266416fbb2168592d45173bed --- /dev/null +++ b/benchmarks/kernels/benchmark_moe_defaults.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +# 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark comparing old vs new default fused MoE configs. + +Runs the triton fused_moe kernel with three configurations for each scenario: + 1. Tuned config (from JSON file, if available) — the target to match + 2. Old default (the hardcoded defaults before this change) + 3. New default (the improved defaults) + +Usage: + python benchmarks/kernels/benchmark_moe_defaults.py + +Produces a table showing kernel time (us) and speedup of new vs old defaults. +""" + +import torch + +from vllm.model_executor.layers.fused_moe import fused_topk, override_config +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_experts, + get_default_config, + get_moe_configs, +) +from vllm.platforms import current_platform +from vllm.triton_utils import triton +from vllm.utils.torch_utils import set_random_seed + +FP8_DTYPE = current_platform.fp8_dtype() + + +def old_default_config(M, E, N, K, topk, dtype=None, block_shape=None): + """The original defaults before https://github.com/vllm-project/vllm/pull/34846, + for comparison.""" + if dtype == "fp8_w8a8" and block_shape is not None: + return { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_shape[0], + "BLOCK_SIZE_K": block_shape[1], + "GROUP_SIZE_M": 32, + "SPLIT_K": 1, + "num_warps": 4, + "num_stages": 3 if not current_platform.is_rocm() else 2, + } + elif M <= E: + return { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "SPLIT_K": 1, + } + else: + return { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + "SPLIT_K": 1, + } + + +def benchmark_config( + config, + M, + E, + N, + K, + topk, + dtype, + use_fp8=False, + block_shape=None, + num_iters=100, +): + """Time a single kernel config. 
Returns kernel time in microseconds.""" + init_dtype = torch.float16 if use_fp8 else dtype + + a = torch.randn(M, K, device="cuda", dtype=init_dtype) / 10 + w1 = torch.randn(E, 2 * N, K, device="cuda", dtype=init_dtype) / 10 + w2 = torch.randn(E, K, N, device="cuda", dtype=init_dtype) / 10 + + w1_scale = None + w2_scale = None + a1_scale = None + a2_scale = None + if use_fp8: + if block_shape is not None: + bsn, bsk = block_shape + n_tiles_w1 = triton.cdiv(2 * N, bsn) + k_tiles_w1 = triton.cdiv(K, bsk) + n_tiles_w2 = triton.cdiv(K, bsn) + k_tiles_w2 = triton.cdiv(N, bsk) + w1_scale = torch.rand( + E, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32 + ) + w2_scale = torch.rand( + E, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32 + ) + else: + w1_scale = torch.rand(E, device="cuda", dtype=torch.float32) + w2_scale = torch.rand(E, device="cuda", dtype=torch.float32) + a1_scale = torch.rand(1, device="cuda", dtype=torch.float32) + a2_scale = torch.rand(1, device="cuda", dtype=torch.float32) + # Only weights are stored in fp8; activations stay in bf16/fp16 + # and get dynamically quantized inside the kernel. 
+ w1 = w1.to(FP8_DTYPE) + w2 = w2.to(FP8_DTYPE) + + quant_config = FusedMoEQuantConfig.make( + quant_dtype=torch.float8_e4m3fn if use_fp8 else None, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape, + ) + + gating = torch.randn(M, E, device="cuda", dtype=torch.float32) + + # Warmup + for _ in range(20): + with override_config(config): + topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True) + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + quant_config=quant_config, + ) + torch.accelerator.synchronize() + + # Benchmark + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + for _ in range(num_iters): + with override_config(config): + topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True) + fused_experts( + a, + w1, + w2, + topk_weights, + topk_ids, + quant_config=quant_config, + ) + end.record() + torch.accelerator.synchronize() + return start.elapsed_time(end) / num_iters * 1000 # ms -> us + + +# Model configurations: (name, E, N, K, topk, dtype_str, use_fp8, block_shape) +# N = moe_intermediate_size // tp_size (the value used in config file lookup) +MODELS = [ + # --- Few experts --- + ("Mixtral bf16", 8, 7168, 4096, 2, None, False, None), + ("Mixtral fp8", 8, 7168, 4096, 2, "fp8_w8a8", True, None), + # --- Many experts: real model shapes at tp=1 --- + # Qwen2-MoE-57B: E=60, topk=4, N=1408, K=2048 + ("Qwen2-MoE bf16", 60, 1408, 2048, 4, None, False, None), + # DeepSeek-V2: E=64, topk=6, N=1407, K=4096 + # (use 1408 to avoid odd alignment; real model is 1407) + ("DeepSeek-V2 bf16", 64, 1408, 4096, 6, None, False, None), + # OLMoE-7B: E=64, topk=8, N=2048, K=2048 + ("OLMoE bf16", 64, 2048, 2048, 8, None, False, None), + # GLM-4-100B-A10B: E=128, topk=8, N=1408, K=4096 + ("GLM-4-MoE bf16", 128, 1408, 4096, 8, None, False, None), + # Qwen3-30B-A3B: E=128, topk=8, N=768, K=2048 + ("Qwen3-MoE bf16", 128, 
768, 2048, 8, None, False, None), + # DeepSeek-V3 / MiMo-V2-Flash: E=256, topk=8, N=2048, K=7168 + ("DeepSeek-V3 bf16", 256, 2048, 7168, 8, None, False, None), + # Qwen3.5-70B-A22B (Qwen3-Next): E=512, topk=10, N=512, K=2048 + ("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None), + # E=128 N=1856 bf16 + ("E128 N1856 bf16", 128, 1856, 4096, 8, None, False, None), + # E=256 N=512 bf16 (DS-V3 tp=4) + ("DS-V3 tp4 bf16", 256, 512, 7168, 8, None, False, None), + # E=512 N=512 bf16 (Qwen3-Next tp=1) + ("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None), + # E=512 N=256 bf16 (Qwen3-Next tp=2) + ("Qwen3-Next tp2", 512, 256, 2048, 10, None, False, None), + # --- FP8 block quant (many experts) --- + # DS-V3 tp=4: E=256, N=512, fp8 block + ("DS-V3 tp4 fp8blk", 256, 512, 7168, 8, "fp8_w8a8", True, [128, 128]), + # DS-V3 tp=8: E=256, N=256, fp8 block + ("DS-V3 tp8 fp8blk", 256, 256, 7168, 8, "fp8_w8a8", True, [128, 128]), + # Qwen3-Next tp=2 fp8 block + ("Qwen3-Next tp2 fp8blk", 512, 256, 2048, 10, "fp8_w8a8", True, [128, 128]), +] + +BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096] + + +def main(): + set_random_seed(0) + torch.set_default_device("cuda") + dtype = torch.bfloat16 + + for name, E, N, K, topk, dtype_str, use_fp8, block_shape in MODELS: + print(f"\n{'=' * 90}") + print(f" {name} (E={E}, N={N}, K={K}, topk={topk})") + print(f"{'=' * 90}") + + # Try to load tuned config + block_n = block_shape[0] if block_shape else None + block_k = block_shape[1] if block_shape else None + tuned = get_moe_configs(E, N, dtype_str, block_n, block_k) + has_tuned = tuned is not None + print(f" Tuned config available: {has_tuned}") + + hdr = ( + f"{'Batch':>6} | {'Tuned (us)':>11} | {'Old (us)':>11} | " + f"{'New (us)':>11} | {'New/Old':>8} | {'New/Tuned':>10}" + ) + print(f" {hdr}") + print(f" {'-' * len(hdr)}") + + for M in BATCH_SIZES: + old_cfg = old_default_config(M, E, N, K, topk, dtype_str, block_shape) + new_cfg = get_default_config(M, E, N, K, 
topk, dtype_str, block_shape) + + if has_tuned: + tuned_cfg = tuned[min(tuned.keys(), key=lambda x: abs(x - M))] + t_tuned = benchmark_config( + tuned_cfg, + M, + E, + N, + K, + topk, + dtype, + use_fp8=use_fp8, + block_shape=block_shape, + ) + else: + t_tuned = None + + t_old = benchmark_config( + old_cfg, + M, + E, + N, + K, + topk, + dtype, + use_fp8=use_fp8, + block_shape=block_shape, + ) + t_new = benchmark_config( + new_cfg, + M, + E, + N, + K, + topk, + dtype, + use_fp8=use_fp8, + block_shape=block_shape, + ) + + ratio_new_old = t_new / t_old + tuned_str = f"{t_tuned:11.2f}" if t_tuned else f"{'N/A':>11}" + ratio_tuned = f"{t_new / t_tuned:10.2f}x" if t_tuned else f"{'N/A':>10}" + # flag regressions where new default is >5% slower than old + marker = " <--" if ratio_new_old > 1.05 else "" + + print( + f" {M:>6} | {tuned_str} | {t_old:11.2f} | {t_new:11.2f} " + f"| {ratio_new_old:7.2f}x | {ratio_tuned}{marker}" + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 9c386a9b895b2e97ad5bca4b587ce42a1df9827a..f93e66f0e12c8d5603baca8043fe2ccb77cf1f06 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -72,19 +72,19 @@ def benchmark_permute( # JIT compilation & warmup run() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Capture 10 invocations with CUDA graph graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): for _ in range(10): run() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Warmup for _ in range(5): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) @@ -92,7 +92,7 @@ def benchmark_permute( latencies: list[float] = [] for i in range(num_iters): prepare(i) - torch.cuda.synchronize() + 
torch.accelerator.synchronize() start_event.record() graph.replay() @@ -185,26 +185,26 @@ def benchmark_unpermute( # JIT compilation & warmup input = prepare() run(input) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Capture 10 invocations with CUDA graph graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph): for _ in range(10): run(input) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Warmup for _ in range(5): graph.replay() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() graph.replay() end_event.record() diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 3e03651357784bfd3c1d539a3eeafd76c54d311a..6548c74f808920264fcff6200f61d9fda2bea77f 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -36,6 +36,7 @@ from typing import Any import numpy as np import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.transformers_utils.config import get_config from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -78,6 +79,7 @@ def calculate_stats(times: list[float]) -> dict[str, float]: } +@default_vllm_config() def benchmark_mrope( model_name: str, num_tokens: int, @@ -133,14 +135,14 @@ def benchmark_mrope( key.clone(), ) - torch.cuda.synchronize() + torch.accelerator.synchronize() # Time reference implementation torch_times = [] for _ in range(benchmark_iter): query_clone = query.clone() key_clone = key.clone() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.time() mrope_helper_class.forward_native( @@ -149,7 +151,7 @@ def benchmark_mrope( key_clone, ) - torch.cuda.synchronize() + 
torch.accelerator.synchronize() torch_times.append(time.time() - start_time) # Time triton kernel implementation @@ -157,14 +159,14 @@ def benchmark_mrope( for _ in range(benchmark_iter): query_clone = query.clone() key_clone = key.clone() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_time = time.time() mrope_helper_class.forward_cuda( positions, query_clone, key_clone, ) - torch.cuda.synchronize() + torch.accelerator.synchronize() triton_times.append(time.time() - start_time) # Calculate statistics diff --git a/benchmarks/kernels/bench_mxfp4_qutlass.py b/benchmarks/kernels/benchmark_mxfp4_qutlass.py similarity index 100% rename from benchmarks/kernels/bench_mxfp4_qutlass.py rename to benchmarks/kernels/benchmark_mxfp4_qutlass.py diff --git a/benchmarks/kernels/bench_nvfp4_gemm.py b/benchmarks/kernels/benchmark_nvfp4_gemm.py similarity index 100% rename from benchmarks/kernels/bench_nvfp4_gemm.py rename to benchmarks/kernels/benchmark_nvfp4_gemm.py diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/benchmark_nvfp4_quant.py similarity index 100% rename from benchmarks/kernels/bench_nvfp4_quant.py rename to benchmarks/kernels/benchmark_nvfp4_quant.py diff --git a/benchmarks/kernels/bench_nvfp4_qutlass.py b/benchmarks/kernels/benchmark_nvfp4_qutlass.py similarity index 100% rename from benchmarks/kernels/bench_nvfp4_qutlass.py rename to benchmarks/kernels/benchmark_nvfp4_qutlass.py diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index be871d3d1aa082b510748c46f4a08ae94579237c..b6a0b7ad8cacd0f2814da4f0e85520dc7cd3b4f9 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -103,7 +103,7 @@ def main( max_logits = torch.empty_like(exp_sums) def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() + torch.accelerator.synchronize() if profile: 
torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() @@ -173,7 +173,7 @@ def main( ) else: raise ValueError(f"Invalid version: {version}") - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() if profile: diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index eba4d510258b67ba22e59d3000a1516048ba71b1..f2195a6d780b5d367b875b86f5a7d57f8c9f5d83 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -28,7 +28,7 @@ def _time_cuda( # warmup for _ in range(warmup_iters): fn() - torch.cuda.synchronize() + torch.accelerator.synchronize() start = torch.Event(enable_timing=True) end = torch.Event(enable_timing=True) @@ -37,7 +37,7 @@ def _time_cuda( for _ in range(bench_iters): fn() end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() return start.elapsed_time(end) / bench_iters # ms/iter diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/benchmark_per_token_quant_fp8.py similarity index 99% rename from benchmarks/kernels/bench_per_token_quant_fp8.py rename to benchmarks/kernels/benchmark_per_token_quant_fp8.py index 7792cfd03b0e49022d7365b9a96e8e41ad236a99..6ce97e30368b735a5c860c9d7549ffbb42e610e8 100644 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ b/benchmarks/kernels/benchmark_per_token_quant_fp8.py @@ -7,6 +7,7 @@ from unittest.mock import patch import pandas as pd import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.triton_utils import triton @@ -84,6 +85,7 @@ def calculate_diff( configs = [] +@default_vllm_config() def benchmark_quantization( batch_size, hidden_size, diff --git a/benchmarks/kernels/benchmark_quant.py 
b/benchmarks/kernels/benchmark_quant.py index 9a21cfe94e5be1d69114fe049a6f8167eaf36592..d01c7ac37c5387f11b8c4f471403a08238b2e686 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -29,7 +29,7 @@ def main( scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: - torch.cuda.synchronize() + torch.accelerator.synchronize() if profile: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() @@ -39,7 +39,7 @@ def main( ops.scaled_int8_quant(x, scale) else: ops.scaled_fp8_quant(x, scale) - torch.cuda.synchronize() + torch.accelerator.synchronize() end_time = time.perf_counter() if profile: diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py index 99067d8ac3710fc7f86dcd3017b3a8ea218426de..97af4ac976ee48287dc1f3cbcc1876f9d96b3d5b 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ -84,16 +84,16 @@ def run_benchmark( g = torch.cuda.CUDAGraph() with torch.cuda.graph(g): function_under_test() - torch.cuda.synchronize() + torch.accelerator.synchronize() function_under_test = lambda: g.replay() def run_cuda_benchmark(n_iters: int) -> float: nonlocal key, value, key_cache, value_cache, slot_mapping - torch.cuda.synchronize() + torch.accelerator.synchronize() start = time.perf_counter() for _ in range(n_iters): function_under_test() - torch.cuda.synchronize() + torch.accelerator.synchronize() end = time.perf_counter() return (end - start) / n_iters @@ -104,7 +104,7 @@ def run_benchmark( # free tensors to mitigate OOM when sweeping del key, value, key_cache, value_cache, slot_mapping - torch.cuda.empty_cache() + torch.accelerator.empty_cache() return lat diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index 
ef6be1f3c3597c9d4922b6bba8ad4128fecfbd0a..55c203725186e75930e80d9fcd0aca1aabb04751 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -109,16 +109,16 @@ def run_benchmark( g = torch.cuda.CUDAGraph() with torch.cuda.graph(g): function_under_test() - torch.cuda.synchronize() + torch.accelerator.synchronize() function_under_test = lambda: g.replay() def run_cuda_benchmark(n_iters: int) -> float: nonlocal key, value, key_cache, value_cache, slot_mapping - torch.cuda.synchronize() + torch.accelerator.synchronize() start = time.perf_counter() for _ in range(n_iters): function_under_test() - torch.cuda.synchronize() + torch.accelerator.synchronize() end = time.perf_counter() return (end - start) / n_iters @@ -129,7 +129,7 @@ def run_benchmark( # free tensors to mitigate OOM when sweeping del key, value, key_cache, value_cache, slot_mapping - torch.cuda.empty_cache() + torch.accelerator.empty_cache() return lat diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 7a1bc050bb33fd8cee8f4d0405572d7af1017bc3..5e1df3b2939abf2a7632c7148d6794bbc6b53167 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -5,6 +5,7 @@ import itertools import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -29,6 +30,7 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device): args={}, ) ) + @default_vllm_config() def benchmark(batch_size, seq_len, num_heads, provider): dtype = torch.bfloat16 max_position = 8192 diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index da32bc30cb2ae3b385b79c852334f1594a4fe52d..13b97b7696b3c7b54df606bd3e421df672d4decb 100644 --- 
a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -251,7 +251,7 @@ def benchmark( kernel( y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G ) - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) @@ -259,7 +259,7 @@ def benchmark( # Benchmark latencies: list[float] = [] for _ in range(runs): - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() for i in range(iterations_per_run): diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 1d0d6fbb9a470582773c0eb6fc605a210e180cfc..89970e2b0661ce9cd081eda0360924b8b0a69066 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -126,7 +126,7 @@ def benchmark_decode( ) def time_fn(fn, warmup=10, trials=20): - torch.cuda.synchronize() + torch.accelerator.synchronize() start = torch.Event(enable_timing=True) end = torch.Event(enable_timing=True) times = [] @@ -136,7 +136,7 @@ def benchmark_decode( start.record() fn() end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() times.append(start.elapsed_time(end)) # ms return sum(times) / len(times), torch.std(torch.tensor(times)) diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 84bde723abf7fa02090c783296092540571845da..6b9d6b7f8318a5c8ccb4261200c911c4f9df967d 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -138,7 +138,7 @@ def benchmark_prefill( ) def time_fn(fn, warmup=10, trials=20): - torch.cuda.synchronize() + torch.accelerator.synchronize() start = torch.Event(enable_timing=True) end = torch.Event(enable_timing=True) times 
= [] @@ -148,7 +148,7 @@ def benchmark_prefill( start.record() fn() end.record() - torch.cuda.synchronize() + torch.accelerator.synchronize() times.append(start.elapsed_time(end)) # ms return sum(times) / len(times), torch.std(torch.tensor(times)) diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 3a85c5c74d6932ab4403a04bb7a546a49e79314e..36dce1b6388a4e836ddb68452f81328b51b334ff 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -177,18 +177,18 @@ def benchmark_config( def run(): w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype) - torch.cuda.synchronize() + torch.accelerator.synchronize() # JIT complication & warmup for _ in range(5): run() - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event = torch.Event(enable_timing=True) end_event = torch.Event(enable_timing=True) latencies: list[float] = [] for i in range(num_iters): - torch.cuda.synchronize() + torch.accelerator.synchronize() start_event.record() run() end_event.record() @@ -285,7 +285,7 @@ def tune_on_gpu(args_dict): weight_shapes = args_dict["weight_shapes"] args = args_dict["args"] - torch.cuda.set_device(gpu_id) + torch.accelerator.set_device_index(gpu_id) print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}") block_n = args.block_n @@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus): def main(args): print(args) - num_gpus = torch.cuda.device_count() + num_gpus = torch.accelerator.device_count() if num_gpus == 0: raise RuntimeError("No GPU available for tuning") print(f"Found {num_gpus} GPUs for parallel tuning") diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index 5a85526a151e56e680e95fc1d8599c4a335002cd..4384d3e56828e309050570458f0037e5cb226f66 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py 
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -35,7 +35,7 @@ def benchmark_shape( B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16) # Reference result in BF16 - torch.cuda.synchronize() + torch.accelerator.synchronize() C_ref = A @ B.t() # Pre-quantize B for all implementations @@ -121,14 +121,14 @@ def benchmark_shape( # Warmup for _ in range(warmup): func() - torch.cuda.synchronize() + torch.accelerator.synchronize() # Timing loop - torch.cuda.synchronize() + torch.accelerator.synchronize() start = time.time() for _ in range(repeat): func() - torch.cuda.synchronize() + torch.accelerator.synchronize() end = time.time() # Calculate timing and TFLOPS diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md index b0be1e3a69a66a403df20f0b8beac9916aff8a02..fa3fa0513e8f2221378ecf09531aed4f5b99b3a4 100644 --- a/benchmarks/multi_turn/README.md +++ b/benchmarks/multi_turn/README.md @@ -7,7 +7,7 @@ First start serving your model ```bash export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/ -vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests +vllm serve $MODEL_PATH --served-model-name Llama ``` The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface). 
diff --git a/benchmarks/run_structured_output_benchmark.sh b/benchmarks/run_structured_output_benchmark.sh index b043ab83e4608e7b27e2f94e6ec24a05f7c474aa..bc40ed83f438c69212feda8207f63fa000100121 100755 --- a/benchmarks/run_structured_output_benchmark.sh +++ b/benchmarks/run_structured_output_benchmark.sh @@ -71,7 +71,7 @@ while [[ $# -gt 0 ]]; do usage ;; *) - echo "Unknown argument: $1\n" + printf "Unknown argument: %s\n" "$1" usage ;; esac @@ -84,15 +84,17 @@ mkdir -p "$OUTPUT_DIR" QPS_VALUES=(25 20 15 10 5 1) # Common parameters -COMMON_PARAMS="--backend $BACKEND \ - --model $MODEL \ - --dataset $DATASET \ - --structured-output-ratio $STRUCTURED_OUTPUT_RATIO \ - --save-results \ - --result-dir $OUTPUT_DIR \ - --output-len $MAX_NEW_TOKENS \ - --port $PORT \ - --tokenizer-mode $TOKENIZER_MODE" +COMMON_PARAMS=( + --backend "$BACKEND" + --model "$MODEL" + --dataset "$DATASET" + --structured-output-ratio "$STRUCTURED_OUTPUT_RATIO" + --save-results + --result-dir "$OUTPUT_DIR" + --output-len "$MAX_NEW_TOKENS" + --port "$PORT" + --tokenizer-mode "$TOKENIZER_MODE" +) echo "Starting structured output benchmark with model: $MODEL" echo "Backend: $BACKEND" @@ -109,17 +111,17 @@ for qps in "${QPS_VALUES[@]}"; do GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") # Construct filename for this run - FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json" + FILENAME="${BACKEND}_${qps}qps_$(basename "$MODEL")_${DATASET}_${GIT_HASH}_${GIT_BRANCH}.json" NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc) NUM_PROMPTS=${NUM_PROMPTS%.*} # Remove fractional part echo "Running benchmark with $NUM_PROMPTS prompts" # Run the benchmark - python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \ - --request-rate $qps \ + python "$SCRIPT_DIR/benchmark_serving_structured_output.py" "${COMMON_PARAMS[@]}" \ + --request-rate "$qps" \ --result-filename "$FILENAME" \ - --num-prompts $NUM_PROMPTS + --num-prompts "$NUM_PROMPTS" echo 
"Completed benchmark with QPS: $qps" echo "----------------------------------------" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index c9813a73d91add696fa9fba0061efa3b740ec1b6..8d74d6d5d96c38179158f6737c014c405028edb2 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -13,27 +13,16 @@ endif() # # Define environment variables for special configurations # -set(ENABLE_AVX2 $ENV{VLLM_CPU_AVX2}) -set(ENABLE_AVX512 $ENV{VLLM_CPU_AVX512}) -set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16}) -set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI}) -set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16}) +set(ENABLE_X86_ISA $ENV{VLLM_CPU_X86}) +set(ENABLE_ARM_BF16 $ENV{VLLM_CPU_ARM_BF16}) include_directories("${CMAKE_SOURCE_DIR}/csrc") - set (ENABLE_NUMA TRUE) # # Check the compile flags # - -if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") - list(APPEND CXX_COMPILE_FLAGS - "-mf16c" - ) -endif() - if(MACOSX_FOUND) list(APPEND CXX_COMPILE_FLAGS "-DVLLM_CPU_EXTENSION") @@ -77,18 +66,6 @@ function(check_sysctl TARGET OUT) endif() endfunction() - -function (is_avx512_disabled OUT) - set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) - if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") - set(${OUT} ON PARENT_SCOPE) - else() - set(${OUT} OFF PARENT_SCOPE) - endif() -endfunction() - -is_avx512_disabled(AVX512_DISABLED) - if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") message(STATUS "Apple Silicon Detected") set(APPLE_SILICON_FOUND TRUE) @@ -96,84 +73,44 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") check_sysctl(hw.optional.neon ASIMD_FOUND) check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND) else() - find_isa(${CPUINFO} "avx2" AVX2_FOUND) - find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "Power11" POWER11_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support find_isa(${CPUINFO} "bf16" 
ARM_BF16_FOUND) # Check for ARM BF16 support find_isa(${CPUINFO} "S390" S390_FOUND) - find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support + find_isa(${CPUINFO} "zvfhmin" RVV_FP16_FOUND) # Check for RISC-V Vector FP16 support + find_isa(${CPUINFO} "zvfbfmin" RVV_BF16_FOUND) # Check for RISC-V Vector BF16 support # Support cross-compilation by allowing override via environment variables - if (ENABLE_AVX2) - set(AVX2_FOUND ON) - message(STATUS "AVX2 support enabled via VLLM_CPU_AVX2 environment variable") - endif() - if (ENABLE_AVX512) - set(AVX512_FOUND ON) - message(STATUS "AVX512 support enabled via VLLM_CPU_AVX512 environment variable") + if (ENABLE_ARM_BF16) + set(ARM_BF16_FOUND ON) + message(STATUS "ARM BF16 support enabled via VLLM_CPU_ARM_BF16 environment variable") endif() endif() -if (AVX512_FOUND AND NOT AVX512_DISABLED) - list(APPEND CXX_COMPILE_FLAGS +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA) + set(ENABLE_X86_ISA ON) + if (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)) + message(FATAL_ERROR "X86 backend requires gcc/g++ >= 12.3") + endif() + list(APPEND CXX_COMPILE_FLAGS "-mf16c") + list(APPEND CXX_COMPILE_FLAGS_AVX512 ${CXX_COMPILE_FLAGS}) + list(APPEND CXX_COMPILE_FLAGS_AVX2 ${CXX_COMPILE_FLAGS}) + list(APPEND CXX_COMPILE_FLAGS_AVX512 "-mavx512f" "-mavx512vl" "-mavx512bw" "-mavx512dq") - - find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND) - if (AVX512BF16_FOUND OR ENABLE_AVX512BF16) - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND - CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) - list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16") - set(ENABLE_AVX512BF16 ON) - else() - set(ENABLE_AVX512BF16 OFF) - message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3") - endif() - else() - set(ENABLE_AVX512BF16 OFF) - message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." 
" If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") - endif() - - find_isa(${CPUINFO} "avx512_vnni" AVX512VNNI_FOUND) - if (AVX512VNNI_FOUND OR ENABLE_AVX512VNNI) - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND - CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) - list(APPEND CXX_COMPILE_FLAGS "-mavx512vnni") - set(ENABLE_AVX512VNNI ON) - else() - set(ENABLE_AVX512VNNI OFF) - message(WARNING "Disable AVX512-VNNI ISA support, requires gcc/g++ >= 12.3") - endif() - else() - set(ENABLE_AVX512VNNI OFF) - message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.") - endif() - - find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND) - if (AMXBF16_FOUND OR ENABLE_AMXBF16) - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND - CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) - list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile") - set(ENABLE_AMXBF16 ON) - add_compile_definitions(-DCPU_CAPABILITY_AMXBF16) - else() - set(ENABLE_AMXBF16 OFF) - message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3") - endif() - else() - set(ENABLE_AMXBF16 OFF) - message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." 
" If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.") - endif() - -elseif (AVX2_FOUND) - list(APPEND CXX_COMPILE_FLAGS "-mavx2") - message(WARNING "vLLM CPU backend using AVX2 ISA") - + list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX + ${CXX_COMPILE_FLAGS_AVX512} + "-mamx-bf16" + "-mamx-tile" + "-mavx512bf16" + "-mavx512vnni") + list(APPEND CXX_COMPILE_FLAGS_AVX2 + "-mavx2") elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) message(STATUS "PowerPC detected") if (POWER9_FOUND) @@ -208,18 +145,26 @@ elseif (S390_FOUND) "-march=native" "-mtune=native") elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64") - if(RVV_FOUND) - message(FAIL_ERROR "Can't support rvv now.") + message(STATUS "RISC-V detected") + if(RVV_BF16_FOUND) + message(STATUS "BF16 extension detected") + set(MARCH_FLAGS -march=rv64gcv_zvfh_zfbfmin_zvfbfmin_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d) + add_compile_definitions(RISCV_BF16_SUPPORT) + elseif (RVV_FP16_FOUND) + message(WARNING "BF16 functionality is not available") + set(MARCH_FLAGS -march=rv64gcv_zvfh_zvl128b -mrvv-vector-bits=zvl -mabi=lp64d) else() + message(STATUS "compile riscv with scalar") list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc") endif() + list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.") + message(FATAL_ERROR "vLLM CPU backend requires X86, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.") endif() -# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms) -if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) +# Build oneDNN for GEMM kernels +if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64 # TODO [fadara01]: remove this once ACL can be fetched and built automatically 
as a dependency of oneDNN set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "") @@ -308,13 +253,24 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON ) else() message(STATUS "Downloading oneDNN from GitHub") - FetchContent_Declare( - oneDNN - GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.10 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE - ) + if(ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) + message(STATUS "aarch64 detected: using pinned oneDNN commit 9c5be1cc59e368aebf0909e6cf20f981ea61462a") + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG 9c5be1cc59e368aebf0909e6cf20f981ea61462a + GIT_PROGRESS TRUE + GIT_SHALLOW FALSE + ) + else() + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG v3.10 + GIT_PROGRESS TRUE + GIT_SHALLOW TRUE + ) + endif() endif() set(ONEDNN_LIBRARY_TYPE "STATIC") @@ -324,13 +280,21 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") set(ONEDNN_BUILD_GRAPH "OFF") - set(ONEDNN_ENABLE_JIT_PROFILING "OFF") + set(ONEDNN_ENABLE_JIT_PROFILING "ON") set(ONEDNN_ENABLE_ITT_TASKS "OFF") - set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") - set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") - set(ONEDNN_VERBOSE "OFF") + set(ONEDNN_ENABLE_MAX_CPU_ISA "ON") + set(ONEDNN_ENABLE_CPU_ISA_HINTS "ON") + set(ONEDNN_VERBOSE "ON") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + # TODO: Refactor this + if (ENABLE_X86_ISA) + # Note: only enable oneDNN for AVX512 + list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}) + else() + list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS}) + endif() + set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE}) set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size FetchContent_MakeAvailable(oneDNN) @@ -343,14 +307,21 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON 
PRIVATE ${oneDNN_SOURCE_DIR}/src ) target_link_libraries(dnnl_ext dnnl torch) - target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC) list(APPEND LIBS dnnl_ext) set(USE_ONEDNN ON) else() set(USE_ONEDNN OFF) endif() -message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +# TODO: Refactor this +if (ENABLE_X86_ISA) + message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}") + message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}") + message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}") +else() + message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +endif() if(ENABLE_NUMA) list(APPEND LIBS numa) @@ -385,25 +356,6 @@ set(VLLM_EXT_SRC "csrc/cpu/cpu_attn.cpp" "csrc/cpu/torch_bindings.cpp") -if (AVX512_FOUND AND NOT AVX512_DISABLED) - set(VLLM_EXT_SRC - "csrc/cpu/shm.cpp" - "csrc/cpu/cpu_wna16.cpp" - "csrc/cpu/cpu_fused_moe.cpp" - ${VLLM_EXT_SRC}) - if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) - set(VLLM_EXT_SRC - "csrc/cpu/sgl-kernels/gemm.cpp" - "csrc/cpu/sgl-kernels/gemm_int8.cpp" - "csrc/cpu/sgl-kernels/gemm_fp8.cpp" - "csrc/cpu/sgl-kernels/moe.cpp" - "csrc/cpu/sgl-kernels/moe_int8.cpp" - "csrc/cpu/sgl-kernels/moe_fp8.cpp" - ${VLLM_EXT_SRC}) - add_compile_definitions(-DCPU_CAPABILITY_AVX512) - endif() -endif() - if (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) set(VLLM_EXT_SRC "csrc/cpu/shm.cpp" @@ -416,21 +368,102 @@ if(USE_ONEDNN) ${VLLM_EXT_SRC}) endif() -message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}") +if (ENABLE_X86_ISA) + set(VLLM_EXT_SRC_SGL + "csrc/cpu/sgl-kernels/gemm.cpp" + "csrc/cpu/sgl-kernels/gemm_int8.cpp" + "csrc/cpu/sgl-kernels/gemm_fp8.cpp" + "csrc/cpu/sgl-kernels/moe.cpp" + "csrc/cpu/sgl-kernels/moe_int8.cpp" + "csrc/cpu/sgl-kernels/moe_fp8.cpp") -# -# Define extension targets -# + set(VLLM_EXT_SRC_AVX512 + "csrc/cpu/shm.cpp" 
+ "csrc/cpu/cpu_wna16.cpp" + "csrc/cpu/cpu_fused_moe.cpp" + "csrc/cpu/utils.cpp" + "csrc/cpu/cpu_attn.cpp" + "csrc/cpu/dnnl_kernels.cpp" + "csrc/cpu/torch_bindings.cpp" + # TODO: Remove these files + "csrc/cpu/activation.cpp" + "csrc/cpu/layernorm.cpp" + "csrc/cpu/mla_decode.cpp" + "csrc/cpu/pos_encoding.cpp" + "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") + + set(VLLM_EXT_SRC_AVX2 + "csrc/cpu/utils.cpp" + "csrc/cpu/cpu_attn.cpp" + "csrc/cpu/torch_bindings.cpp" + # TODO: Remove these files + "csrc/cpu/activation.cpp" + "csrc/cpu/layernorm.cpp" + "csrc/cpu/mla_decode.cpp" + "csrc/cpu/pos_encoding.cpp" + "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") + + message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}") + message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}") + message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}") + + set(_C_LIBS numa dnnl_ext) + set(_C_AVX512_LIBS numa dnnl_ext) + set(_C_AVX2_LIBS numa) + + # AMX + AVX512F + AVX512BF16 + AVX512VNNI + define_extension_target( + _C + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL} + LIBRARIES ${_C_LIBS} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX} + USE_SABI 3 + WITH_SOABI + ) -define_extension_target( - _C - DESTINATION vllm - LANGUAGE CXX - SOURCES ${VLLM_EXT_SRC} - LIBRARIES ${LIBS} - COMPILE_FLAGS ${CXX_COMPILE_FLAGS} - USE_SABI 3 - WITH_SOABI -) + # For AMX kernels + target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16") + + # AVX512F + define_extension_target( + _C_AVX512 + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_EXT_SRC_AVX512} + LIBRARIES ${_C_AVX512_LIBS} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512} + USE_SABI 3 + WITH_SOABI + ) + + # AVX2 + define_extension_target( + _C_AVX2 + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_EXT_SRC_AVX2} + LIBRARIES ${_C_AVX2_LIBS} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2} + USE_SABI 3 + WITH_SOABI + ) +else() + 
message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}") + # + # Define extension targets + # + define_extension_target( + _C + DESTINATION vllm + LANGUAGE CXX + SOURCES ${VLLM_EXT_SRC} + LIBRARIES ${LIBS} + COMPILE_FLAGS ${CXX_COMPILE_FLAGS} + USE_SABI 3 + WITH_SOABI + ) +endif() message(STATUS "Enabling C extension.") diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake index 90187850f18a234038587c54c31eb410088bc4de..0f16b9161fa3ca17faaad664b344d4a5d623f12e 100644 --- a/cmake/external_projects/flashmla.cmake +++ b/cmake/external_projects/flashmla.cmake @@ -19,7 +19,7 @@ else() FetchContent_Declare( flashmla GIT_REPOSITORY https://github.com/vllm-project/FlashMLA - GIT_TAG c2afa9cb93e674d5a9120a170a6da57b89267208 + GIT_TAG 692917b1cda61b93ac9ee2d846ec54e75afe87b1 GIT_PROGRESS TRUE CONFIGURE_COMMAND "" BUILD_COMMAND "" diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index b51934a3ab29ac76b000ee5c5ba48b56ac86364e..a7e9e6ff5545bacd0fa9b98e8c7321ae12703179 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -17,7 +17,8 @@ endif() # They should be identical but if they aren't, this is a massive footgun. # # The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. -# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3). +# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2), --component _vllm_fa3_C (for FA3), +# or --component _vllm_fa4_cutedsl_C (for FA4 CuteDSL Python files). # If no component is specified, vllm-flash-attn is still installed. # If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. 
@@ -38,22 +39,16 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2 + GIT_TAG 1488682bb545f7d020e958a33116b1419d1cfc83 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ) endif() - -# Ensure the vllm/vllm_flash_attn directory exists before installation -install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS) - # Make sure vllm-flash-attn install rules are nested under vllm/ -# This is here to support installing all components under the same prefix with cmake --install. -# setup.py installs every component separately but uses the same prefix for all. -# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3, -# and these statements don't hurt when installing neither component. +# ALL_COMPONENTS ensures the save/modify/restore runs exactly once regardless +# of how many components are being installed, avoiding double-append of /vllm/. 
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS) install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS) @@ -62,22 +57,48 @@ install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_ FetchContent_MakeAvailable(vllm-flash-attn) message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") -# Restore the install prefix +# Restore the install prefix after FA's install rules install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) -# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in -# case only one is built, in the case both are built redundant work is done) -install( - DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm/vllm_flash_attn - COMPONENT _vllm_fa2_C - FILES_MATCHING PATTERN "*.py" -) +# Install shared Python files for both FA2 and FA3 components +foreach(_FA_COMPONENT _vllm_fa2_C _vllm_fa3_C) + # Ensure the vllm/vllm_flash_attn directory exists before installation + install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" + COMPONENT ${_FA_COMPONENT}) + + # Copy vllm_flash_attn python files (except __init__.py and flash_attn_interface.py + # which are source-controlled in vllm) + install( + DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ + DESTINATION vllm/vllm_flash_attn + COMPONENT ${_FA_COMPONENT} + FILES_MATCHING PATTERN "*.py" + PATTERN "__init__.py" EXCLUDE + PATTERN "flash_attn_interface.py" EXCLUDE + ) + +endforeach() -install( - DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm/vllm_flash_attn - COMPONENT _vllm_fa3_C - FILES_MATCHING PATTERN "*.py" -) +# +# FA4 CuteDSL component +# This is a Python-only component that copies the flash_attn/cute directory +# 
and transforms imports to match our package structure. +# +add_custom_target(_vllm_fa4_cutedsl_C) + +# Copy flash_attn/cute directory (needed for FA4) and transform imports +# The cute directory uses flash_attn.cute imports internally, which we replace +# with vllm.vllm_flash_attn.cute to match our package structure. +install(CODE " + file(GLOB_RECURSE CUTE_PY_FILES \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute/*.py\") + foreach(SRC_FILE \${CUTE_PY_FILES}) + file(RELATIVE_PATH REL_PATH \"${vllm-flash-attn_SOURCE_DIR}/flash_attn/cute\" \${SRC_FILE}) + set(DST_FILE \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn/cute/\${REL_PATH}\") + get_filename_component(DST_DIR \${DST_FILE} DIRECTORY) + file(MAKE_DIRECTORY \${DST_DIR}) + file(READ \${SRC_FILE} FILE_CONTENTS) + string(REPLACE \"flash_attn.cute\" \"vllm.vllm_flash_attn.cute\" FILE_CONTENTS \"\${FILE_CONTENTS}\") + file(WRITE \${DST_FILE} \"\${FILE_CONTENTS}\") + endforeach() +" COMPONENT _vllm_fa4_cutedsl_C) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 8268065ef02c896f24ff112326a8bc8b08976222..758a777955535e0a948f63c810a5fdef4c1b1e11 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -5,6 +5,7 @@ #include #include "cuda_compat.h" +#include "cuda_vec_utils.cuh" #include "dispatch_utils.h" namespace vllm { @@ -16,52 +17,55 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x, return act_first ? ACT_FN(x) * y : x * ACT_FN(y); } -// Check if all pointers are 16-byte aligned for int4 vectorized access -__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) { - return (reinterpret_cast(ptr) & 15) == 0; +template +__device__ __forceinline__ packed_t packed_compute(const packed_t& x, + const packed_t& y) { + return act_first ? packed_mul(PACKED_ACT_FN(x), y) + : packed_mul(x, PACKED_ACT_FN(y)); } // Activation and gating kernel template. 
-template +template __global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] const int d) { - constexpr int VEC_SIZE = 16 / sizeof(scalar_t); - const int64_t token_idx = blockIdx.x; - const scalar_t* x_ptr = input + token_idx * 2 * d; + const scalar_t* x_ptr = input + blockIdx.x * 2 * d; const scalar_t* y_ptr = x_ptr + d; - scalar_t* out_ptr = out + token_idx * d; + scalar_t* out_ptr = out + blockIdx.x * d; - // Check alignment for 128-bit vectorized access. - // All three pointers must be 16-byte aligned for safe int4 operations. - const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) && - is_16byte_aligned(out_ptr); + if constexpr (use_vec) { + using cuda_t = typename CUDATypeConverter::Type; + using pvec_t = PackedVec; - if (aligned && d >= VEC_SIZE) { - // Fast path: 128-bit vectorized loop - const int4* x_vec = reinterpret_cast(x_ptr); - const int4* y_vec = reinterpret_cast(y_ptr); - int4* out_vec = reinterpret_cast(out_ptr); - const int num_vecs = d / VEC_SIZE; - const int vec_end = num_vecs * VEC_SIZE; + const pvec_t* x_vec = reinterpret_cast(x_ptr); + const pvec_t* y_vec = reinterpret_cast(y_ptr); + pvec_t* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / 2 / pvec_t::NUM_ELTS; for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { - int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r; - auto* xp = reinterpret_cast(&x); - auto* yp = reinterpret_cast(&y); - auto* rp = reinterpret_cast(&r); + pvec_t x, y; + if constexpr (use_256b) { + ld256(x, &x_vec[i]); + ld256(y, &y_vec[i]); + } else { + ld128(x, &x_vec[i]); + ld128(y, &y_vec[i]); + } #pragma unroll - for (int j = 0; j < VEC_SIZE; j++) { - rp[j] = compute(xp[j], yp[j]); + for (int j = 0; j < pvec_t::NUM_ELTS; j++) { + x.elts[j] = packed_compute( + x.elts[j], y.elts[j]); + } + if constexpr (use_256b) { + st256(x, &out_vec[i]); + } else { + st128(x, &out_vec[i]); } - out_vec[i] = r; - } - // 
Scalar cleanup for remaining elements - for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { - out_ptr[i] = compute(VLLM_LDG(&x_ptr[i]), - VLLM_LDG(&y_ptr[i])); } } else { // Scalar fallback for unaligned data or small d @@ -79,6 +83,15 @@ __device__ __forceinline__ T silu_kernel(const T& x) { return (T)(((float)x) / (1.0f + expf((float)-x))); } +template +__device__ __forceinline__ packed_t packed_silu_kernel(const packed_t& val) { + // x * sigmoid(x) + float2 fval = cast_to_float2(val); + fval.x = fval.x / (1.0f + expf(-fval.x)); + fval.y = fval.y / (1.0f + expf(-fval.y)); + return cast_to_packed(fval); +} + template __device__ __forceinline__ T gelu_kernel(const T& x) { // Equivalent to PyTorch GELU with 'none' approximation. @@ -89,6 +102,18 @@ __device__ __forceinline__ T gelu_kernel(const T& x) { return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA))); } +template +__device__ __forceinline__ packed_t packed_gelu_kernel(const packed_t& val) { + // Equivalent to PyTorch GELU with 'none' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 + constexpr float ALPHA = M_SQRT1_2; + float2 fval = cast_to_float2(val); + fval.x = fval.x * 0.5f * (1.0f + ::erf(fval.x * ALPHA)); + fval.y = fval.y * 0.5f * (1.0f + ::erf(fval.y * ALPHA)); + return cast_to_packed(fval); +} + template __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { // Equivalent to PyTorch GELU with 'tanh' approximation. @@ -102,32 +127,86 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { return (T)(0.5f * f * (1.0f + ::tanhf(inner))); } +template +__device__ __forceinline__ packed_t +packed_gelu_tanh_kernel(const packed_t& val) { + // Equivalent to PyTorch GELU with 'tanh' approximation. 
+ // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 + float2 fval = cast_to_float2(val); + constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; + constexpr float KAPPA = 0.044715; + + float x_cube = fval.x * fval.x * fval.x; + float inner = BETA * (fval.x + KAPPA * x_cube); + fval.x = 0.5f * fval.x * (1.0f + ::tanhf(inner)); + + x_cube = fval.y * fval.y * fval.y; + inner = BETA * (fval.y + KAPPA * x_cube); + fval.y = 0.5f * fval.y * (1.0f + ::tanhf(inner)); + return cast_to_packed(fval); +} + } // namespace vllm // Launch activation and gating kernel. // Use ACT_FIRST (bool) indicating whether to apply the activation function // first. -#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \ - int d = input.size(-1) / 2; \ - int64_t num_tokens = input.numel() / input.size(-1); \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - if (num_tokens == 0) { \ - return; \ - } \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "act_and_mul_kernel", [&] { \ - vllm::act_and_mul_kernel, ACT_FIRST> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d); \ - }); +#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST) \ + auto dtype = input.scalar_type(); \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + if (num_tokens == 0) { \ + return; \ + } \ + dim3 grid(num_tokens); \ + int cc_major = at::cuda::getCurrentDeviceProperties()->major; \ + int support_vec = \ + (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) \ + ? 
vllm::VecTraits::ARCH_MAX_VEC_SIZE \ + : vllm::VecTraits::ARCH_MAX_VEC_SIZE; \ + int vec_size = support_vec / at::elementSize(dtype); \ + const bool use_vec = (d % vec_size == 0); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + if (use_vec) { \ + dim3 block(std::min(d / vec_size, 1024)); \ + if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) { \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \ + vllm::act_and_mul_kernel< \ + scalar_t, typename vllm::PackedTypeConverter::Type, \ + KERNEL, \ + PACKED_KERNEL::Type>, \ + ACT_FIRST, true, true><<>>( \ + out.data_ptr(), input.data_ptr(), d); \ + }); \ + } else { \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \ + vllm::act_and_mul_kernel< \ + scalar_t, typename vllm::PackedTypeConverter::Type, \ + KERNEL, \ + PACKED_KERNEL::Type>, \ + ACT_FIRST, true, false><<>>( \ + out.data_ptr(), input.data_ptr(), d); \ + }); \ + } \ + } else { \ + dim3 block(std::min(d, 1024)); \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \ + vllm::act_and_mul_kernel< \ + scalar_t, typename vllm::PackedTypeConverter::Type, \ + KERNEL, \ + PACKED_KERNEL::Type>, \ + ACT_FIRST, false><<>>( \ + out.data_ptr(), input.data_ptr(), d); \ + }); \ + } void silu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel, + true); } void mul_and_silu(torch::Tensor& out, // [..., d] @@ -135,19 +214,22 @@ void mul_and_silu(torch::Tensor& out, // [..., d] { // The difference between mul_and_silu and silu_and_mul is that mul_and_silu // applies the silu to the latter half of the input. 
- LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel, + false); } void gelu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, vllm::packed_gelu_kernel, + true); } void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, + vllm::packed_gelu_tanh_kernel, true); } namespace vllm { @@ -158,42 +240,53 @@ __device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) { return (T)(f > threshold ? f : 0.0f); } -template +template +__device__ __forceinline__ packed_t +packed_fatrelu_kernel(const packed_t& val, const float threshold) { + float2 fval = cast_to_float2(val); + fval.x = fval.x > threshold ? fval.x : 0.0f; + fval.y = fval.y > threshold ? 
fval.y : 0.0f; + return cast_to_packed(fval); +} + +template __global__ void act_and_mul_kernel_with_param( scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, const float param) { - constexpr int VEC_SIZE = 16 / sizeof(scalar_t); - const int64_t token_idx = blockIdx.x; - const scalar_t* x_ptr = input + token_idx * 2 * d; + const scalar_t* x_ptr = input + blockIdx.x * 2 * d; const scalar_t* y_ptr = x_ptr + d; - scalar_t* out_ptr = out + token_idx * d; + scalar_t* out_ptr = out + blockIdx.x * d; - // Check alignment for 128-bit vectorized access - const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) && - is_16byte_aligned(out_ptr); + if constexpr (use_vec) { + using cuda_t = typename CUDATypeConverter::Type; + using pvec_t = PackedVec; - if (aligned && d >= VEC_SIZE) { - // Fast path: 128-bit vectorized loop - const int4* x_vec = reinterpret_cast(x_ptr); - const int4* y_vec = reinterpret_cast(y_ptr); - int4* out_vec = reinterpret_cast(out_ptr); - const int num_vecs = d / VEC_SIZE; - const int vec_end = num_vecs * VEC_SIZE; + const pvec_t* x_vec = reinterpret_cast(x_ptr); + const pvec_t* y_vec = reinterpret_cast(y_ptr); + pvec_t* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / 2 / pvec_t::NUM_ELTS; for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { - int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r; - auto* xp = reinterpret_cast(&x); - auto* yp = reinterpret_cast(&y); - auto* rp = reinterpret_cast(&r); + pvec_t x, y; + if constexpr (use_256b) { + ld256(x, &x_vec[i]); + ld256(y, &y_vec[i]); + } else { + ld128(x, &x_vec[i]); + ld128(y, &y_vec[i]); + } #pragma unroll - for (int j = 0; j < VEC_SIZE; j++) { - rp[j] = ACT_FN(xp[j], param) * yp[j]; + for (int j = 0; j < pvec_t::NUM_ELTS; j++) { + x.elts[j] = packed_mul(PACKED_ACT_FN(x.elts[j], param), y.elts[j]); + } + if constexpr (use_256b) { + st256(x, &out_vec[i]); + } else { + st128(x, &out_vec[i]); } - out_vec[i] = r; - } - // Scalar cleanup 
for remaining elements - for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { - out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]); } } else { // Scalar fallback for unaligned data or small d @@ -276,20 +369,61 @@ __global__ void swigluoai_and_mul_kernel( } // namespace vllm -#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ - int d = input.size(-1) / 2; \ - int64_t num_tokens = input.numel() / input.size(-1); \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \ - vllm::act_and_mul_kernel_with_param> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d, \ - PARAM); \ - }); +#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PACKED_KERNEL, PARAM) \ + auto dtype = input.scalar_type(); \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + if (num_tokens == 0) { \ + return; \ + } \ + dim3 grid(num_tokens); \ + int cc_major = at::cuda::getCurrentDeviceProperties()->major; \ + int support_vec = \ + (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) \ + ? 
vllm::VecTraits::ARCH_MAX_VEC_SIZE \ + : vllm::VecTraits::ARCH_MAX_VEC_SIZE; \ + int vec_size = support_vec / at::elementSize(dtype); \ + const bool use_vec = (d % vec_size == 0); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + if (use_vec) { \ + dim3 block(std::min(d / vec_size, 1024)); \ + if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) { \ + VLLM_DISPATCH_FLOATING_TYPES( \ + dtype, "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param< \ + scalar_t, typename vllm::PackedTypeConverter::Type, \ + KERNEL, \ + PACKED_KERNEL< \ + typename vllm::PackedTypeConverter::Type>, \ + true, true><<>>( \ + out.data_ptr(), input.data_ptr(), d, \ + PARAM); \ + }); \ + } else { \ + VLLM_DISPATCH_FLOATING_TYPES( \ + dtype, "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param< \ + scalar_t, typename vllm::PackedTypeConverter::Type, \ + KERNEL, \ + PACKED_KERNEL< \ + typename vllm::PackedTypeConverter::Type>, \ + true, false><<>>( \ + out.data_ptr(), input.data_ptr(), d, \ + PARAM); \ + }); \ + } \ + } else { \ + dim3 block(std::min(d, 1024)); \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param< \ + scalar_t, typename vllm::PackedTypeConverter::Type, \ + KERNEL, \ + PACKED_KERNEL::Type>, \ + false><<>>( \ + out.data_ptr(), input.data_ptr(), d, PARAM); \ + }); \ + } #define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ int d = input.size(-1) / 2; \ @@ -309,7 +443,8 @@ __global__ void swigluoai_and_mul_kernel( void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { - LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); + LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM( + vllm::fatrelu_kernel, vllm::packed_fatrelu_kernel, threshold); } void swigluoai_and_mul(torch::Tensor& out, // [..., d] 
torch::Tensor& input, // [..., 2 * d] @@ -319,39 +454,41 @@ void swigluoai_and_mul(torch::Tensor& out, // [..., d] namespace vllm { // Element-wise activation kernel template. -template +template __global__ void activation_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., d] const int d) { - constexpr int VEC_SIZE = 16 / sizeof(scalar_t); - const int64_t token_idx = blockIdx.x; - const scalar_t* in_ptr = input + token_idx * d; - scalar_t* out_ptr = out + token_idx * d; - - // Check alignment for 128-bit vectorized access - const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr); - - if (aligned && d >= VEC_SIZE) { - // Fast path: 128-bit vectorized loop - const int4* in_vec = reinterpret_cast(in_ptr); - int4* out_vec = reinterpret_cast(out_ptr); + const scalar_t* in_ptr = input + blockIdx.x * d; + scalar_t* out_ptr = out + blockIdx.x * d; + + if constexpr (use_vec) { + // Fast path: 128-bit/256-bit vectorized loop + using vec_t = typename VecTraits::vec_t; + constexpr int ARCH_MAX_VEC_SIZE = VecTraits::ARCH_MAX_VEC_SIZE; + constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(scalar_t); + const vec_t* in_vec = reinterpret_cast(in_ptr); + vec_t* out_vec = reinterpret_cast(out_ptr); const int num_vecs = d / VEC_SIZE; - const int vec_end = num_vecs * VEC_SIZE; for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { - int4 v = VLLM_LDG(&in_vec[i]), r; + vec_t v; + if constexpr (use_256b) { + ld256(v, &in_vec[i]); + } else { + v = VLLM_LDG(&in_vec[i]); + } auto* vp = reinterpret_cast(&v); - auto* rp = reinterpret_cast(&r); #pragma unroll for (int j = 0; j < VEC_SIZE; j++) { - rp[j] = ACT_FN(vp[j]); + vp[j] = ACT_FN(vp[j]); + } + if constexpr (use_256b) { + st256(v, &out_vec[i]); + } else { + out_vec[i] = v; } - out_vec[i] = r; - } - // Scalar cleanup for remaining elements - for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { - out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i])); } } else { // Scalar 
fallback for unaligned data or small d @@ -365,18 +502,46 @@ __global__ void activation_kernel( } // namespace vllm // Launch element-wise activation kernel. -#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ - int d = input.size(-1); \ - int64_t num_tokens = input.numel() / d; \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \ - vllm::activation_kernel> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d); \ - }); +#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ + auto dtype = input.scalar_type(); \ + int d = input.size(-1); \ + int64_t num_tokens = input.numel() / input.size(-1); \ + if (num_tokens == 0) { \ + return; \ + } \ + dim3 grid(num_tokens); \ + int cc_major = at::cuda::getCurrentDeviceProperties()->major; \ + int support_vec = \ + (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) \ + ? 
vllm::VecTraits::ARCH_MAX_VEC_SIZE \ + : vllm::VecTraits::ARCH_MAX_VEC_SIZE; \ + int vec_size = support_vec / at::elementSize(dtype); \ + const bool use_vec = (d % vec_size == 0); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + if (use_vec) { \ + dim3 block(std::min(d / vec_size, 1024)); \ + if (CUDA_VERSION >= 12090 && cc_major >= 10 && num_tokens > 128) { \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \ + vllm::activation_kernel, true, true> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); \ + } else { \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \ + vllm::activation_kernel, true, false> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); \ + } \ + } else { \ + dim3 block(std::min(d, 1024)); \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \ + vllm::activation_kernel, false> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); \ + } namespace vllm { diff --git a/csrc/cache.h b/csrc/cache.h index 22a58389e74c7e415cd07fad6407e218d050f995..4ffc57e245ea09401750b46124cf584882313917 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -74,6 +74,12 @@ void indexer_k_quant_and_cache( int64_t quant_block_size, // quantization block size const std::string& scale_fmt); +// Concatenate query nope and rope for MLA/DSA attention +void concat_mla_q( + torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim] + torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim] + torch::Tensor& q_out); // [num_tokens, num_heads, nope_dim + rope_dim] + // Extract function to gather quantized K cache void cp_gather_indexer_k_quant_cache( const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride] diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 064a26888eee64786a4b6e4d0215b5a74ffee44b..ce307a56904d73b2ecc0bd0d83261dd33245029a 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ 
-8,6 +8,7 @@ #include "cuda_compat.h" #include "dispatch_utils.h" #include "quantization/vectorization_utils.cuh" +#include "concat_mla_q.cuh" #ifdef USE_ROCM #include "quantization/w8a8/fp8/amd/quant_utils.cuh" @@ -918,8 +919,8 @@ __global__ void gather_and_maybe_dequant_cache( // SCALAR_T is the data type of the destination tensor. // CACHE_T is the stored data type of kv-cache. // KV_DTYPE is the real data type of kv-cache. -#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \ - vllm::gather_and_maybe_dequant_cache \ <<>>( \ reinterpret_cast(src_cache.data_ptr()), \ @@ -930,6 +931,12 @@ __global__ void gather_and_maybe_dequant_cache( dst_entry_stride, reinterpret_cast(scale.data_ptr()), \ seq_starts_ptr); +#define CALL_GATHER_CACHE_576(SCALAR_T, CACHE_T, KV_DTYPE) \ + CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 576) + +#define CALL_GATHER_CACHE_320(SCALAR_T, CACHE_T, KV_DTYPE) \ + CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE, 320) + // Gather sequences from the cache into the destination tensor. // - cu_seq_lens contains the cumulative sequence lengths for each batch // - block_table contains the cache block indices for each sequence @@ -959,9 +966,10 @@ void gather_and_maybe_dequant_cache( TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, "seq_starts must be int32"); } - TORCH_CHECK(head_dim == 576, - "gather_and_maybe_dequant_cache only support the head_dim to 576 " - "for better performance") + TORCH_CHECK( + head_dim == 320 || head_dim == 576, + "gather_and_maybe_dequant_cache only support the head_dim to 320 or 576 " + "for better performance") TORCH_CHECK(src_cache.device() == dst.device(), "src_cache and dst must be on the same device"); @@ -986,7 +994,13 @@ void gather_and_maybe_dequant_cache( const int32_t* seq_starts_ptr = seq_starts.has_value() ? 
seq_starts.value().data_ptr() : nullptr; - DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE); + if (head_dim == 576) { + DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, + CALL_GATHER_CACHE_576); + } else { + DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, + CALL_GATHER_CACHE_320); + } } namespace vllm { @@ -995,75 +1009,67 @@ namespace vllm { // Similar to cp_gather_cache but specifically for FP8->BF16 conversion __global__ void cp_gather_and_upconvert_fp8_kv_cache( const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656] - __nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576] - const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] - const int32_t* __restrict__ seq_lens, // [BATCH] - const int32_t* __restrict__ workspace_starts, // [BATCH] - const int32_t block_size, const int32_t head_dim, - const int64_t block_table_stride, const int64_t cache_block_stride, - const int64_t cache_entry_stride, const int64_t dst_entry_stride) { - const int64_t bid = blockIdx.x; // Batch ID - const int32_t num_splits = gridDim.y; - const int32_t split = blockIdx.y; - const int32_t seq_start = workspace_starts[bid]; - const int32_t seq_len = seq_lens[bid]; - const int32_t tot_slots = seq_len; - const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits); + __nv_bfloat16* __restrict__ dst, // [total_tokens, 576] + const int32_t* __restrict__ block_table, // [num_reqs, BLOCK_INDICES] + const int32_t* __restrict__ workspace_starts, // [num_reqs] + const int32_t num_reqs, const int32_t block_size, + const int32_t total_tokens, const int64_t block_table_stride, + const int64_t cache_block_stride, const int64_t cache_entry_stride, + const int64_t dst_entry_stride) { + const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5; + if (flat_warp_id >= total_tokens) return; + const int lane_id = threadIdx.x & 31; + + // Binary search to find which request owns this output token + int lo = 0, hi = 
num_reqs - 1; + while (lo < hi) { + int mid = (lo + hi + 1) >> 1; + if (workspace_starts[mid] <= flat_warp_id) + lo = mid; + else + hi = mid - 1; + } + const int req_id = lo; - const int32_t split_start = split * split_slots; - const int32_t split_end = min((split + 1) * split_slots, tot_slots); + // Compute physical token address via block table + const int out_token_id = flat_warp_id; + const int token_offset = out_token_id - workspace_starts[req_id]; + const int cache_block_idx = token_offset / block_size; + const int offset_in_block = token_offset % block_size; + const int physical_block = + block_table[req_id * block_table_stride + cache_block_idx]; - const bool is_active_split = (split_start < tot_slots); + const uint8_t* token_ptr = src_cache + physical_block * cache_block_stride + + offset_in_block * cache_entry_stride; - if (!is_active_split) return; + const int4* nope_src = reinterpret_cast(token_ptr); + const int4 fp8_data = nope_src[lane_id]; - // Adjust the pointer for the block_table for this batch - const int32_t batch_offset = bid * block_table_stride; - int32_t offset = split_start; - int32_t offset_div = offset / block_size; - offset = offset % block_size; - const int32_t* batch_block_table = block_table + batch_offset; + const float* scales_ptr = reinterpret_cast(token_ptr + 512); + const float scale = scales_ptr[lane_id >> 3]; - // Adjust dst pointer based on the cumulative sequence lengths - dst += seq_start * dst_entry_stride; + const uint2 fp8_lo = make_uint2(fp8_data.x, fp8_data.y); + const uint2 fp8_hi = make_uint2(fp8_data.z, fp8_data.w); +#ifdef USE_ROCM + const bf16_8_t bf16_lo = + fp8::scaled_vec_conversion(fp8_lo, scale); + const bf16_8_t bf16_hi = + fp8::scaled_vec_conversion(fp8_hi, scale); +#else + const bf16_8_t bf16_lo = + fp8::scaled_vec_conversion(fp8_lo, scale, __NV_E4M3); + const bf16_8_t bf16_hi = + fp8::scaled_vec_conversion(fp8_hi, scale, __NV_E4M3); +#endif - const int tid = threadIdx.x; + __nv_bfloat16* dst_ptr = dst + 
out_token_id * dst_entry_stride; + int4* nope_dst = reinterpret_cast(dst_ptr) + lane_id * 2; + nope_dst[0] = *reinterpret_cast(&bf16_lo); + nope_dst[1] = *reinterpret_cast(&bf16_hi); - // Process each token in this split - for (int pid = split_start; pid < split_end; ++pid) { - auto block_id = batch_block_table[offset_div]; - const uint8_t* token_ptr = - src_cache + block_id * cache_block_stride + offset * cache_entry_stride; - __nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride; - - // FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16) - const uint8_t* no_pe_ptr = token_ptr; - const float* scales_ptr = reinterpret_cast(token_ptr + 512); - const __nv_bfloat16* rope_ptr = - reinterpret_cast(token_ptr + 512 + 16); - - // Parallelize fp8 dequant (512 elements) and rope copy (64 elements) - if (tid < 512) { - // FP8 dequantization - const int tile = tid >> 7; // each tile is 128 elements - const float scale = scales_ptr[tile]; - const uint8_t val = no_pe_ptr[tid]; - dst_ptr[tid] = - fp8::scaled_convert<__nv_bfloat16, uint8_t, - vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale); - } else if (tid < 576) { - // Rope copy (64 bf16 elements) - const int rope_idx = tid - 512; - dst_ptr[512 + rope_idx] = rope_ptr[rope_idx]; - } - - // Move to next token - offset += 1; - if (offset == block_size) { - offset_div += 1; - offset = 0; - } - } + const int* rope_src = reinterpret_cast(token_ptr + 528); + int* rope_dst = reinterpret_cast(dst_ptr + 512); + rope_dst[lane_id] = rope_src[lane_id]; } template @@ -1234,8 +1240,13 @@ void cp_gather_and_upconvert_fp8_kv_cache( "src_cache and seq_lens must be on the same device"); TORCH_CHECK(src_cache.device() == workspace_starts.device(), "src_cache and workspace_starts must be on the same device"); - - TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8"); + auto dtype = src_cache.scalar_type(); + TORCH_CHECK( + dtype == at::ScalarType::Byte || // uint8 + dtype == at::ScalarType::Float8_e4m3fn || // 
fp8 e4m3 + dtype == at::ScalarType::Float8_e5m2, // fp8 e5m2 + "src_cache must be uint8, float8_e4m3fn, or float8_e5m2, but got ", + src_cache.dtype()); TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16"); TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA"); @@ -1244,16 +1255,24 @@ void cp_gather_and_upconvert_fp8_kv_cache( int64_t cache_entry_stride = src_cache.stride(1); int64_t dst_entry_stride = dst.stride(0); - // Decide on the number of splits based on the batch size - int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; - dim3 grid(batch_size, num_splits); - dim3 block(576); + const uint8_t* src_ptr = nullptr; + if (dtype == at::ScalarType::Byte) { + src_ptr = src_cache.data_ptr(); + } else { + // float8_e4m3fn or float8_e5m2 + src_ptr = reinterpret_cast(src_cache.data_ptr()); + } + + const int total_tokens = dst.size(0); + constexpr int warps_per_block = 8; + const int grid_size = (total_tokens + warps_per_block - 1) / warps_per_block; + const int block_size_threads = warps_per_block * 32; // 256 threads - vllm::cp_gather_and_upconvert_fp8_kv_cache<<>>( - src_cache.data_ptr(), - reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()), - block_table.data_ptr(), seq_lens.data_ptr(), - workspace_starts.data_ptr(), block_size, head_dim, + vllm::cp_gather_and_upconvert_fp8_kv_cache<<>>( + src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()), + block_table.data_ptr(), workspace_starts.data_ptr(), + static_cast(batch_size), block_size, total_tokens, block_table_stride, cache_block_stride, cache_entry_stride, dst_entry_stride); } @@ -1293,7 +1312,8 @@ void indexer_k_quant_and_cache( const at::cuda::OptionalCUDAGuard device_guard(device_of(k)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3", + static const std::string kv_cache_dtype = "fp8_e4m3"; + DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), kv_cache_dtype, CALL_INDEXER_K_QUANT_AND_CACHE); } @@ -1352,3 +1372,43 @@ void 
cp_gather_indexer_k_quant_cache( CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32); } } + +// Concatenate ql_nope and q_pe into a contiguous q_out tensor for MLA/DSA. +// Replaces torch.cat((ql_nope, q_pe), dim=-1). +void concat_mla_q(torch::Tensor& ql_nope, // [num_tokens, num_heads, nope_dim] + torch::Tensor& q_pe, // [num_tokens, num_heads, rope_dim] + torch::Tensor& q_out // [num_tokens, num_heads, nope_dim + + // rope_dim] +) { + const int num_tokens = ql_nope.size(0); + const int num_heads = ql_nope.size(1); + const int nope_dim = ql_nope.size(2); + const int rope_dim = q_pe.size(2); + + TORCH_CHECK(nope_dim % 512 == 0, "nope_dim must be a multiple of 512, got ", + nope_dim); + TORCH_CHECK(rope_dim == 64, "rope_dim must be 64, got ", rope_dim); + TORCH_CHECK(q_out.size(2) == nope_dim + rope_dim); + + TORCH_CHECK(ql_nope.stride(2) == 1, "ql_nope must have stride 1 in dim 2"); + TORCH_CHECK(q_pe.stride(2) == 1, "q_pe must have stride 1 in dim 2"); + TORCH_CHECK(q_out.stride(2) == 1, "q_out must have stride 1 in dim 2"); + + if (num_tokens == 0) return; + + constexpr int warps_per_block = 8; + const int total_warps = num_tokens * num_heads; + const int grid_size = (total_warps + warps_per_block - 1) / warps_per_block; + const int block_size = warps_per_block * 32; + + const at::cuda::OptionalCUDAGuard device_guard(device_of(ql_nope)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + VLLM_DISPATCH_FLOATING_TYPES(ql_nope.scalar_type(), "concat_mla_q", [&] { + vllm::ConcatMLAQKernel<<>>( + q_out.data_ptr(), ql_nope.data_ptr(), + q_pe.data_ptr(), num_tokens, num_heads, q_out.stride(0), + q_out.stride(1), ql_nope.stride(0), ql_nope.stride(1), q_pe.stride(0), + q_pe.stride(1)); + }); +} diff --git a/csrc/concat_mla_q.cuh b/csrc/concat_mla_q.cuh new file mode 100644 index 0000000000000000000000000000000000000000..68bcfa011fb3edd0e38446604ce91271e0fb9e51 --- /dev/null +++ b/csrc/concat_mla_q.cuh @@ -0,0 +1,60 @@ +#ifndef CONCAT_MLA_Q_CUH_ +#define 
CONCAT_MLA_Q_CUH_ + +#include +#include + +#include "cuda_vec_utils.cuh" + +namespace vllm { + +// Concatenates ql_nope [num_tokens, num_heads, NOPE_DIM] and +// q_pe [num_tokens, num_heads, 64] +// into q_out [num_tokens, num_heads, NOPE_DIM+64]. +// Currently instantiated only for NOPE_DIM=512. +// Rope dim is hardcoded to 64 (DeepSeek V3.2 MLA) +template +__global__ void ConcatMLAQKernel( + DType* __restrict__ q_out, const DType* __restrict__ ql_nope, + const DType* __restrict__ q_pe, const int num_tokens, const int num_heads, + const int64_t out_stride_0, const int64_t out_stride_1, + const int64_t nope_stride_0, const int64_t nope_stride_1, + const int64_t pe_stride_0, const int64_t pe_stride_1) { + const int flat_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) >> 5; + if (flat_warp_id >= num_tokens * num_heads) return; + + const int token_id = flat_warp_id / num_heads; + const int head_id = flat_warp_id % num_heads; + const int lane_id = threadIdx.x & 31; + + constexpr bool use_256b = VLLM_256B_PTX_ENABLED; + constexpr int nope_vec_loads = + NOPE_DIM * sizeof(DType) / (VecTraits::ARCH_MAX_VEC_SIZE * 32); + + const DType* nope_src = + ql_nope + token_id * nope_stride_0 + head_id * nope_stride_1; + DType* nope_dst = q_out + token_id * out_stride_0 + head_id * out_stride_1; + +#pragma unroll + for (int i = 0; i < nope_vec_loads; i++) { + const int offset = i * 32 + lane_id; + if constexpr (use_256b) { + st256_cs(reinterpret_cast(nope_dst) + offset, + ld256_cs(reinterpret_cast(nope_src) + offset)); + } else { + st128_cs(reinterpret_cast(nope_dst) + offset, + ld128_cs(reinterpret_cast(nope_src) + offset)); + } + } + + const int* rope_src = reinterpret_cast( + q_pe + token_id * pe_stride_0 + head_id * pe_stride_1); + int* rope_dst = reinterpret_cast(q_out + token_id * out_stride_0 + + head_id * out_stride_1 + NOPE_DIM); + + st32_cs(rope_dst + lane_id, ld32_cs(rope_src + lane_id)); +} + +} // namespace vllm + +#endif // CONCAT_MLA_Q_CUH_ diff --git 
a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp index 641f95a2b1dfcc25f79ae02bee0960ad1d728669..a582b4b4d7cc7004d423025228d94cca1ea2bc46 100644 --- a/csrc/cpu/cpu_attn.cpp +++ b/csrc/cpu/cpu_attn.cpp @@ -16,6 +16,8 @@ torch::Tensor get_scheduler_metadata( isa = cpu_attention::ISA::VEC16; } else if (isa_hint == "neon") { isa = cpu_attention::ISA::NEON; + } else if (isa_hint == "vxe") { + isa = cpu_attention::ISA::VXE; } else { TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint); } @@ -100,6 +102,8 @@ void cpu_attn_reshape_and_cache( return cpu_attention::ISA::VEC16; } else if (isa == "neon") { return cpu_attention::ISA::NEON; + } else if (isa == "vxe") { + return cpu_attention::ISA::VXE; } else { TORCH_CHECK(false, "Invalid ISA type: " + isa); } diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp index 8da458b99119c31667ff875eeb947e5979f65968..1c8644d52329a752dff6a1e676923f43d72f8a2a 100644 --- a/csrc/cpu/cpu_attn_amx.hpp +++ b/csrc/cpu/cpu_attn_amx.hpp @@ -420,7 +420,7 @@ class AttentionImpl { const int64_t block_size, const int64_t block_size_stride) { // For AMX 2D tiles, size of each line is 64 bytes constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES; - // For AMX B martix, N always is 16 + // For AMX B matrix, N always is 16 constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4; constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t); // For now suppose block_size is divisible by amx_tile_column_num diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 89cf2dc3a4f4a535d04dfd2f3d7b846481d70aea..c15799fa950d320d26735c036d7e602f94edc92e 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -12,7 +12,7 @@ #include "cpu/utils.hpp" namespace cpu_attention { -enum class ISA { AMX, VEC, VEC16, NEON }; +enum class ISA { AMX, VEC, VEC16, NEON, VXE }; template class AttentionImpl {}; @@ -821,7 +821,7 @@ struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; -#if 
!defined(__powerpc__) && !defined(__s390x__) +#if !defined(__powerpc__) template <> struct VecTypeTrait { using vec_t = vec_op::FP16Vec16; diff --git a/csrc/cpu/cpu_attn_vxe.hpp b/csrc/cpu/cpu_attn_vxe.hpp new file mode 100644 index 0000000000000000000000000000000000000000..45db4ebd73967f34e654932749f2f4a4117cb6d6 --- /dev/null +++ b/csrc/cpu/cpu_attn_vxe.hpp @@ -0,0 +1,386 @@ +#ifndef CPU_ATTN_VXE_HPP +#define CPU_ATTN_VXE_HPP + +#include "cpu_attn_impl.hpp" +#include +#include + +namespace cpu_attention { + +namespace { + +// s390x Vector = 16 bytes (128 bits) +#define BLOCK_SIZE_ALIGNMENT 32 +#define HEAD_SIZE_ALIGNMENT 32 +#define MAX_Q_HEAD_NUM_PER_ITER 16 + +template +FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, __vector float& b0, + __vector float& b1); + +// [1] Float Specialization +template <> +FORCE_INLINE void load_row8_B_as_f32(const float* p, __vector float& b0, + __vector float& b1) { + // Explicitly cast to long long for offset, and float* for pointer + b0 = vec_xl((long long)0, const_cast(p)); + b1 = vec_xl((long long)0, const_cast(p + 4)); +} + +// [2] BFloat16 Specialization (Big Endian Fix) +template <> +FORCE_INLINE void load_row8_B_as_f32(const c10::BFloat16* p, + __vector float& b0, + __vector float& b1) { + // 1. Load 8 BF16s (16 bytes) into one vector + // Explicit cast to unsigned short* for vec_xl to return vector unsigned short + __vector unsigned short raw = vec_xl((long long)0, (unsigned short*)p); + + // 2. Prepare Zero vector + __vector unsigned short zeros = vec_splat_u16(0); + + // 3. 
Merge High/Low to expand BF16 -> Float32 + // On Big Endian, a float is [BF16_bits | 16_zero_bits] + b0 = (__vector float)vec_mergeh(raw, zeros); + b1 = (__vector float)vec_mergel(raw, zeros); +} + +template <> +FORCE_INLINE void load_row8_B_as_f32(const c10::Half* p, + __vector float& b0, + __vector float& b1) { + alignas(16) float tmp[8]; + + // Manual unroll / conversion + tmp[0] = static_cast(p[0]); + tmp[1] = static_cast(p[1]); + tmp[2] = static_cast(p[2]); + tmp[3] = static_cast(p[3]); + tmp[4] = static_cast(p[4]); + tmp[5] = static_cast(p[5]); + tmp[6] = static_cast(p[6]); + tmp[7] = static_cast(p[7]); + + // Explicit arguments for intrinsic: (long long offset, float* ptr) + b0 = vec_xl((long long)0, (float*)tmp); + b1 = vec_xl((long long)0, (float*)(tmp + 4)); +} + +template +FORCE_INLINE void gemm_micro_s390x_Mx8_Ku4( + const float* __restrict A, // [M x K] + const kv_cache_t* __restrict B, // [K x 8] + float* __restrict C, // [M x 8] + int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) { + static_assert(1 <= M && M <= 8, "M must be in [1,8]"); + +// Helper macros to unroll codegen for M rows +#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7) +#define IF_M(i) if constexpr (M > (i)) + + // 1. Define A pointers +#define DECL_A(i) const float* a##i = A + (i) * lda; + ROWS_APPLY(DECL_A) +#undef DECL_A + + // 2. Define Accumulators (2 vectors covers 8 columns) +#define DECL_ACC(i) __vector float acc##i##_0, acc##i##_1; + ROWS_APPLY(DECL_ACC) +#undef DECL_ACC + + // 3. 
Initialize Accumulators (Load C or Zero) +#define INIT_ACC(i) \ + IF_M(i) { \ + if (accumulate) { \ + acc##i##_0 = \ + vec_xl((long long)0, const_cast(C + (i) * ldc + 0)); \ + acc##i##_1 = \ + vec_xl((long long)0, const_cast(C + (i) * ldc + 4)); \ + } else { \ + acc##i##_0 = vec_splats(0.0f); \ + acc##i##_1 = vec_splats(0.0f); \ + } \ + } + ROWS_APPLY(INIT_ACC) +#undef INIT_ACC + + int32_t k = 0; + + for (; k + 3 < K; k += 4) { + // Load 4 values of A for each Row M: A[k...k+3] +#define LOAD_A4(i) \ + __vector float a##i##v; \ + IF_M(i) a##i##v = vec_xl((long long)0, const_cast(a##i + k)); + ROWS_APPLY(LOAD_A4) +#undef LOAD_A4 + + // Helper: FMA for specific lane L of A + // s390x: vec_madd(b, vec_splat(a, lane), acc) +#define FMAS_LANE(i, aiv, L) \ + IF_M(i) { \ + __vector float a_broad = vec_splat(aiv, L); \ + acc##i##_0 = vec_madd(b0, a_broad, acc##i##_0); \ + acc##i##_1 = vec_madd(b1, a_broad, acc##i##_1); \ + } + + // Unroll K=0..3 + { + __vector float b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 0) * ldb, b0, b1); +#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0) + ROWS_APPLY(STEP_K0) +#undef STEP_K0 + } + { + __vector float b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 1) * ldb, b0, b1); +#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1) + ROWS_APPLY(STEP_K1) +#undef STEP_K1 + } + { + __vector float b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 2) * ldb, b0, b1); +#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2) + ROWS_APPLY(STEP_K2) +#undef STEP_K2 + } + + { + __vector float b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 3) * ldb, b0, b1); +#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3) + ROWS_APPLY(STEP_K3) +#undef STEP_K3 + } +#undef FMAS_LANE + } + + for (; k < K; ++k) { + __vector float b0, b1; + load_row8_B_as_f32(B + (int64_t)k * ldb, b0, b1); +#define TAIL_ROW(i) \ + IF_M(i) { \ + __vector float ai = vec_splats(*(a##i + k)); \ + acc##i##_0 = vec_madd(b0, ai, acc##i##_0); \ + acc##i##_1 = vec_madd(b1, ai, acc##i##_1); \ + } + ROWS_APPLY(TAIL_ROW) +#undef TAIL_ROW + 
} + +#define STORE_ROW(i) \ + IF_M(i) { \ + vec_xst(acc##i##_0, 0, C + (i) * ldc + 0); \ + vec_xst(acc##i##_1, 0, C + (i) * ldc + 4); \ + } + ROWS_APPLY(STORE_ROW) +#undef STORE_ROW + +#undef ROWS_APPLY +#undef IF_M +} + +template +FORCE_INLINE void gemm_macro_s390x_Mx8_Ku4(const float* __restrict A, + const kv_cache_t* __restrict B, + float* __restrict C, int32_t M, + int32_t K, int64_t lda, int64_t ldb, + int64_t ldc, bool accumulate) { + static_assert(N % 8 == 0, "N must be a multiple of 8"); + for (int32_t m = 0; m < M;) { + int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1; + const float* Ab = A + m * lda; + float* Cb = C + m * ldc; + + for (int32_t n = 0; n < N; n += 8) { + const kv_cache_t* Bn = B + n; + float* Cn = Cb + n; + switch (mb) { + case 8: + gemm_micro_s390x_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, K, + accumulate); + break; + case 4: + gemm_micro_s390x_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, K, + accumulate); + break; + case 2: + gemm_micro_s390x_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, K, + accumulate); + break; + default: + gemm_micro_s390x_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, K, + accumulate); + break; + } + } + m += mb; + } +} + +template +class TileGemmS390X { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, + float* __restrict__ a_tile, + kv_cache_t* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + if constexpr (phase == AttentionGemmPhase::QK) { + gemm_macro_s390x_Mx8_Ku4( + a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c); + } else { + gemm_macro_s390x_Mx8_Ku4( + a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc, + accum_c); + } + } +}; + +} // namespace + +template +class AttentionImpl { + public: + using query_t = scalar_t; + using q_buffer_t = float; + using kv_cache_t = 
scalar_t; + using logits_buffer_t = float; + using partial_output_buffer_t = float; + using prob_buffer_t = float; + + constexpr static int64_t BlockSizeAlignment = BLOCK_SIZE_ALIGNMENT; + constexpr static int64_t HeadDimAlignment = HEAD_SIZE_ALIGNMENT; + constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER; + constexpr static int64_t HeadDim = head_dim; + constexpr static ISA ISAType = ISA::VXE; + constexpr static bool scale_on_logits = + false; // Scale is applied to Q during copy + + public: + AttentionImpl() {} + + template