diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
index 7045d8810493e5c79d670a11401b99cf16268a2e..bbed80ebe84762412eb49b286d9007223e060085 100644
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -8,7 +8,8 @@ template = """
Links for vLLM
- {wheel}
+ {x86_wheel}
+ {arm_wheel}
"""
@@ -21,7 +22,25 @@ filename = os.path.basename(args.wheel)
with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}")
+ # sync the abi tag with .buildkite/scripts/upload-wheels.sh
+ if "x86_64" in filename:
+ x86_wheel = filename
+ arm_wheel = filename.replace("x86_64", "aarch64").replace(
+ "manylinux1", "manylinux2014"
+ )
+ elif "aarch64" in filename:
+ x86_wheel = filename.replace("aarch64", "x86_64").replace(
+ "manylinux2014", "manylinux1"
+ )
+ arm_wheel = filename
+ else:
+ raise ValueError(f"Unsupported wheel: {filename}")
# cloudfront requires escaping the '+' character
f.write(
- template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+ template.format(
+ x86_wheel=x86_wheel,
+ x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
+ arm_wheel=arm_wheel,
+ arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
+ )
)
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
deleted file mode 100644
index 56ec933c9cc0e5e1fc8041db7f485fa272575d20..0000000000000000000000000000000000000000
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
-model_name: "HandH1998/QQQ-Llama-3-8b-g128"
-tasks:
-- name: "gsm8k"
- metrics:
- - name: "exact_match,strict-match"
- value: 0.419
- - name: "exact_match,flexible-extract"
- value: 0.416
-limit: 1000
-num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
index 27a1a9a82bd352623c44728e4480ee47209bd9f0..37eeac85c933b8a5a077364d0566772d2c592208 100644
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
index a67fc89d54e604b254e1d0c6fdbd5e6fc86ef939..897f84d1e360de11ceb10d77baf0ff9f8453cdfd 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.4
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index b98d42aa7b822481d6b03448a4ed3b4ffe49c891..792f355c47a5178801b2624f1a9e06c69707f0ce 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.4
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index b39f9899a8f284d1d6e1fb8f91800954f2c5fbc5..e6f5c8b60f459a1d25d45493807e407ffca5bab3 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 8afde017d383e3e72c9f1171759ddf581661d8d8..37e2980eea974a5a6bf35b92aabf1c56d2f75819 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -17,7 +17,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+ - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- 8x Nvidia A100 GPUs
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 12c4ba6aa69a60be66396fdb63265f79675bed4f..50431d0cd4c5e69362f4047918ec42edea497ae9 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -3,44 +3,129 @@
import argparse
import json
import os
+from importlib import util
import pandas as pd
+plotly_found = util.find_spec("plotly.express") is not None
+
def compare_data_columns(
files, name_column, data_column, info_cols, drop_column, debug=False
):
- print("\ncompare_data_column: " + data_column)
+ """
+ Align concatenation by keys derived from info_cols instead of row order.
+ - Pick one canonical key list: subset of info_cols present in ALL files.
+ - For each file: set index to those keys, aggregate duplicates
+ - (mean for metric, first for names).
+ - Concat along axis=1 (indexes align), then reset_index so callers can
+ - group by columns.
+ - If --debug, add a _name column per file.
+ """
+ print("\ncompare_data_column:", data_column)
+
frames = []
raw_data_cols = []
compare_frames = []
+
+ # 1) choose a canonical key list from info_cols that exists in ALL files
+ cols_per_file = []
+ for f in files:
+ try:
+ df_tmp = pd.read_json(f, orient="records")
+ except Exception as err:
+ raise ValueError(f"Failed to read {f}") from err
+ cols_per_file.append(set(df_tmp.columns))
+
+ key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
+ if not key_cols:
+ # soft fallback: use any info_cols present in the first file
+ key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
+ if not key_cols:
+ raise ValueError(
+ "No common key columns found from info_cols across the input files."
+ )
+
+ # 2) build a single "meta" block (keys as columns) once, aligned by the key index
+ meta_added = False
+
for file in files:
- data_df = pd.read_json(file)
- serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
- # Show all info columns in the first couple columns
- if not frames:
- for col in info_cols:
- if col not in serving_df.columns:
- print(f"Skipping missing column: {col}")
- continue
- frames.append(serving_df[col])
- # only show test name under debug mode
- if debug is True:
- serving_df = serving_df.rename(columns={name_column: file + "_name"})
- frames.append(serving_df[file + "_name"])
-
- file = "/".join(file.split("/")[:-1])
- serving_df = serving_df.rename(columns={data_column: file})
- frames.append(serving_df[file])
- raw_data_cols.append(file)
- compare_frames.append(serving_df[file])
+ df = pd.read_json(file, orient="records")
+
+ # Keep rows that actually have the compared metric (same as original behavior)
+ if drop_column in df.columns:
+ df = df.dropna(subset=[drop_column], ignore_index=True)
+
+ # Stabilize numeric key columns (harmless if missing)
+ for c in (
+ "Input Len",
+ "Output Len",
+ "TP Size",
+ "PP Size",
+ "# of max concurrency.",
+ "qps",
+ ):
+ if c in df.columns:
+ df[c] = pd.to_numeric(df[c], errors="coerce")
+
+ # Ensure all key columns exist
+ for c in key_cols:
+ if c not in df.columns:
+ df[c] = pd.NA
+
+ # Set index = key_cols and aggregate duplicates → unique MultiIndex
+ df_idx = df.set_index(key_cols, drop=False)
+
+ # meta (key columns), unique per key
+ meta = df_idx[key_cols]
+ if not meta.index.is_unique:
+ meta = meta.groupby(level=key_cols, dropna=False).first()
+
+ # metric series for this file, aggregated to one row per key
+ file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
+ s = df_idx[data_column]
+ if not s.index.is_unique:
+ s = s.groupby(level=key_cols, dropna=False).mean()
+ s.name = file_label # column label like original
+
+ # add meta once (from first file) so keys are the leftmost columns
+ if not meta_added:
+ frames.append(meta)
+ meta_added = True
+
+ # (NEW) debug: aligned test-name column per file
+ if debug and name_column in df_idx.columns:
+ name_s = df_idx[name_column]
+ if not name_s.index.is_unique:
+ name_s = name_s.groupby(level=key_cols, dropna=False).first()
+ name_s.name = f"{file_label}_name"
+ frames.append(name_s)
+
+ frames.append(s)
+ raw_data_cols.append(file_label)
+ compare_frames.append(s)
+
+ # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
if len(compare_frames) >= 2:
- # Compare numbers among two files
- ratio_df = compare_frames[1] / compare_frames[0]
- frames.append(ratio_df)
- compare_frames.pop(1)
+ base = compare_frames[0]
+ current = compare_frames[-1]
+ ratio = current / base
+ ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
+ ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+ frames.append(ratio)
+ # 4) concat on columns with aligned MultiIndex;
+ # then reset_index to return keys as columns
concat_df = pd.concat(frames, axis=1)
+ concat_df = concat_df.reset_index(drop=True).reset_index()
+ if "index" in concat_df.columns:
+ concat_df = concat_df.drop(columns=["index"])
+
+ # Ensure key/info columns appear first (in your info_cols order)
+ front = [c for c in info_cols if c in concat_df.columns]
+ rest = [c for c in concat_df.columns if c not in front]
+ concat_df = concat_df[front + rest]
+
print(raw_data_cols)
return concat_df, raw_data_cols
@@ -67,6 +152,15 @@ def split_json_by_tp_pp(
df = pd.DataFrame(data)
+ # Keep only "serving" tests
+ name_col = next(
+ (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
+ )
+ if name_col:
+ df = df[
+ df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
+ ].copy()
+
# Handle alias column names
rename_map = {
"tp_size": "TP Size",
@@ -181,7 +275,6 @@ if __name__ == "__main__":
f"Expected subset: {filtered_info_cols}, "
f"but DataFrame has: {list(output_df.columns)}"
)
-
output_df_sorted = output_df.sort_values(by=existing_group_cols)
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
for name, group in output_groups:
@@ -189,8 +282,7 @@ if __name__ == "__main__":
text_file.write(html_msgs_for_data_cols[i])
text_file.write(html)
- if plot is True:
- import pandas as pd
+ if plot and plotly_found:
import plotly.express as px
df = group[raw_data_cols]
diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 06d7b5ed484da2c9e9747316a2460bb67ce6c407..a00de940cbbb81bff911996a7ec56d0c83c8470d 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -382,7 +382,7 @@ run_genai_perf_tests() {
client_command="genai-perf profile \
-m $model \
--service-kind openai \
- --backend vllm \
+ --backend "$backend" \
--endpoint-type chat \
--streaming \
--url localhost:$port \
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 85d3e5638742180ce151769bd462627241ead66b..92a1bcada3879bbd5299b7359aa94e4c6648142f 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -7,7 +7,7 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -27,7 +27,12 @@ steps:
env:
DOCKER_BUILDKIT: "1"
+ - block: "Build CUDA 12.6 wheel"
+ key: block-build-cu126-wheel
+ depends_on: ~
+
- label: "Build wheel - CUDA 12.6"
+ depends_on: block-build-cu126-wheel
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
@@ -57,23 +62,45 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build release image"
+ - label: "Build release image (x86)"
depends_on: ~
- key: block-release-image-build
-
- - label: "Build release image"
- depends_on: block-release-image-build
- id: build-release-image
+ id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+ # re-tag to default image tag and push, just in case arm64 build fails
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ - label: "Build release image (arm64)"
+ depends_on: ~
+ id: build-release-image-arm64
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+ # Add job to create multi-arch manifest
+ - label: "Create multi-arch manifest"
+ depends_on:
+ - build-release-image-x86
+ - build-release-image-arm64
+ id: create-multi-arch-manifest
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
- label: "Annotate release workflow"
depends_on:
- - build-release-image
+ - create-multi-arch-manifest
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index df0bae0c9cbff8f93e3c5a238477b935eaaca9f9..c395011a244853bb89f8ad4102f4753b8726ddea 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -164,7 +164,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
- --ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index bbce7a25f97d61e95e2f496aac4dbc28dbb6c2f8..0f734763f13fda95b682eb95a00b2c5b8792895f 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
@@ -46,21 +46,26 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ # Run kernel tests
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -x -v -s tests/kernels/test_onednn.py"
+
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
- # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
- # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+ # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+ # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
# Note: disable Bart until supports V1
- pytest -v -s tests/models/language/generation -m cpu_model \
+ pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
- VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+ VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
--ignore=tests/models/language/generation/test_bart.py
- pytest -v -s tests/models/language/pooling -m cpu_model
- pytest -v -s tests/models/multimodal/generation \
+ pytest -x -v -s tests/models/language/pooling -m cpu_model
+ pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
@@ -68,35 +73,51 @@ function cpu_tests() {
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -s -v \
+ pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
# Note: disable it until supports V1
# Run AWQ test
- docker exec cpu-test-"$NUMA_NODE" bash -c "
- set -e
- VLLM_USE_V1=0 pytest -s -v \
- tests/quantization/test_ipex_quant.py"
+ # docker exec cpu-test-"$NUMA_NODE" bash -c "
+ # set -e
+ # VLLM_USE_V1=0 pytest -x -s -v \
+ # tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -s -v \
+ pytest -x -s -v \
tests/lora/test_qwen2vl.py"
- # online serving
+ # online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+ server_pid=$!
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+ vllm bench serve \
+ --backend vllm \
+ --dataset-name random \
+ --model meta-llama/Llama-3.2-3B-Instruct \
+ --num-prompts 20 \
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
+
+ # online serving: tp+dp
+ docker exec cpu-test-"$NUMA_NODE" bash -c '
+ set -e
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+ server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
- --endpoint /v1/completions'
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index b571618f48c2bf2ed901285422aa2fa2aee6d558..1073a4ee30afa19b5b94f7fefea9a36b1f6d22d3 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index d55a786e41e8b63a0b230d64a67d9a7b17466aa8..505664f3aecd037f45634c5b3ea5c42785201076 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
&& python3 -m pip install --progress-bar off hf-transfer
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 6256677481ae4f669b011d32c467795f791a0ca3..73f3e63fbf5f6dac30e03a6aad910cb041620eec 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -23,12 +23,15 @@ docker run \
--device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \
+ -e "HF_TOKEN=${HF_TOKEN}" \
+ -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
--name "${container_name}" \
"${image_name}" \
- sh -c '
- VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
- VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+ bash -c '
+ set -e
+ echo $ZE_AFFINITY_MASK
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
cd tests
@@ -37,8 +40,8 @@ docker run \
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py
+ pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_serial_utils.py
pytest -v -s v1/test_utils.py
pytest -v -s v1/test_metrics_reader.py
diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh
index 209d9c4341cdd83033a92fb878ceb8e6b13d5298..740d81fb39bb0be2c73d90fedfc8701766ca393c 100755
--- a/.buildkite/scripts/tpu/cleanup_docker.sh
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
- docker volume prune -f && docker system prune --force --filter "until=72h" --all
+ docker volume prune -f && docker system prune --force --filter "until=24h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 037897e53dbef42d43fc7656c3d6caf8c18fbe9a..745f285c008ad846051f85e88972e6525869739c 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -14,8 +14,19 @@ fi
# Get the single wheel file
wheel="${wheel_files[0]}"
-# Rename 'linux' to 'manylinux1' in the wheel filename
-new_wheel="${wheel/linux/manylinux1}"
+# Detect architecture and rename 'linux' to appropriate manylinux version
+arch=$(uname -m)
+if [[ $arch == "x86_64" ]]; then
+ manylinux_version="manylinux1"
+elif [[ $arch == "aarch64" ]]; then
+ manylinux_version="manylinux2014"
+else
+ echo "Warning: Unknown architecture $arch, using manylinux1 as default"
+ manylinux_version="manylinux1"
+fi
+
+# Rename 'linux' to the appropriate manylinux version in the wheel filename
+new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4fc8857854927938e833032f8d4e9188344a1018..55349e0ac9321df1dd343e77d0a7ee49bf80991d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -88,15 +88,6 @@ steps:
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-- label: Chunked Prefill Test
- mirror_hardwares: [amdexperimental]
- source_file_dependencies:
- - vllm/
- - tests/basic_correctness/test_chunked_prefill
- commands:
- - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-
- label: Core Test # 10min
mirror_hardwares: [amdexperimental]
fast_check: true
@@ -118,10 +109,9 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Test (API Server) # 40min
@@ -135,7 +125,8 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
- pytest -v -s entrypoints/test_chat_utils.py
- label: Distributed Tests (4 GPUs) # 10min
@@ -242,16 +233,34 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
-- label: V1 Test
+- label: V1 Test e2e + engine
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/v1
commands:
- # split the test to avoid interference
- - pytest -v -s v1/core
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - pytest -v -s v1/e2e
- pytest -v -s v1/engine
+
+- label: V1 Test entrypoints
+ mirror_hardwares: [amdexperimental]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
- pytest -v -s v1/entrypoints
+
+- label: V1 Test others
+ mirror_hardwares: [amdexperimental]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s v1/core
+ - pytest -v -s v1/executor
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
@@ -263,9 +272,6 @@ steps:
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
- # TODO: accuracy does not match, whether setting
- # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- - pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -295,15 +301,6 @@ steps:
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
-- label: Prefix Caching Test # 9min
- mirror_hardwares: [amdexperimental]
- source_file_dependencies:
- - vllm/
- - tests/prefix_caching
- commands:
- - pytest -v -s prefix_caching
-
-
- label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental]
source_file_dependencies:
@@ -328,7 +325,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
parallelism: 4
- label: PyTorch Compilation Unit Tests
@@ -345,6 +342,7 @@ steps:
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
+ - pytest -v -s compile/test_decorator.py
- label: PyTorch Fullgraph Smoke Test # 9min
mirror_hardwares: [amdexperimental]
@@ -358,6 +356,7 @@ steps:
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/piecewise/test_full_cudagraph.py
+ - pytest -v -s compile/piecewise/test_multiple_graphs.py
- label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental]
@@ -404,6 +403,7 @@ steps:
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
@@ -462,19 +462,17 @@ steps:
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
- # after torchao 0.12 release
- - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+ # after torchao 0.12 release, and pin a working version of torchao nightly here
+ - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
mirror_hardwares: [amdexperimental]
- working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: OpenAI API correctness
mirror_hardwares: [amdexperimental]
@@ -562,6 +560,14 @@ steps:
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
+- label: Multi-Modal Processor Test
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing
+
- label: Multi-Modal Models Test (Standard)
mirror_hardwares: [amdexperimental]
torch_nightly: true
@@ -571,9 +577,7 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- - pytest -v -s models/multimodal/processing
- - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model
- - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # Needs mp_method="spawn"
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Models Test (Extended) 1
@@ -584,7 +588,7 @@ steps:
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
- label: Multi-Modal Models Test (Extended) 2
mirror_hardwares: [amdexperimental]
@@ -647,8 +651,10 @@ steps:
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py
+ - vllm/compilation/fusion_attn.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@@ -660,11 +666,17 @@ steps:
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
##### 1 GPU test #####
##### multi gpus test #####
@@ -757,6 +769,11 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@@ -793,13 +810,14 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- - pytest -v -s -x lora/test_multi_loras_with_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
- label: Weight Loading Multiple GPU Test # 33min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
- num_gpus: 2
+ num_gpus: 2
+ optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
@@ -847,3 +865,10 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+- label: Qwen MoE EP Test # optional
+ gpu: h200
+ optional: true
+ num_gpus: 2
+ commands:
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index b0dd5e99d4c7278b8af61c21f2fbd983b3aacab1..c087fd555c661e5d23ce16a19cda3c75900accbf 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -10,6 +10,7 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/mamba @tdoublep
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
@@ -25,11 +26,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/attention/backends/triton_attn.py @tdoublep
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
-/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
@@ -44,6 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
+/tests/models/language/generation/test_hybrid.py @tdoublep
# Docs
/docs @hmellor
@@ -72,3 +74,15 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
+
+# Kernels
+/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/attention/ops/triton_unified_attention.py @tdoublep
+
+# ROCm related: specify owner with write access to notify AMD folks for careful code review
+/docker/Dockerfile.rocm* @gshtras
+/vllm/v1/attention/backends/rocm*.py @gshtras
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras
+/vllm/attention/ops/rocm*.py @gshtras
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 1b30c1292df85ddcf4317994b074d16f142fbc77..8043df65d5585a946779b6ef86a2e4cb7b98effb 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
## Test Result
-## (Optional) Documentation Update
-
---
Essential Elements of an Effective PR Description Checklist
@@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions)
diff --git a/.github/scale-config.yml b/.github/scale-config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c41a3ee3eb196049de99bdeacb8e2051ec0009ca
--- /dev/null
+++ b/.github/scale-config.yml
@@ -0,0 +1,21 @@
+# scale-config.yml:
+# Powers what instance types are available for GHA auto-scaled
+# runners. Runners listed here will be available as self hosted
+# runners, configuration is directly pulled from the main branch.
+# runner_types:
+# runner_label:
+# instance_type: m4.large
+# os: linux
+# # min_available defaults to the global cfg in the ALI Terraform
+# min_available: undefined
+# # when max_available value is not defined, no max runners is enforced
+# max_available: undefined
+# disk_size: 50
+# is_ephemeral: true
+
+runner_types:
+ linux.2xlarge:
+ disk_size: 150
+ instance_type: c5.2xlarge
+ is_ephemeral: true
+ os: linux
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e0ab3872d8fa377f383b5677b3430cc636335d13
--- /dev/null
+++ b/.github/workflows/issue_autolabel.yml
@@ -0,0 +1,309 @@
+name: Label issues based on keywords
+on:
+ issues:
+ types: [opened, edited, reopened]
+permissions:
+ issues: write # needed so the workflow can add labels
+ contents: read
+concurrency:
+ group: issue-labeler-${{ github.event.issue.number }}
+ cancel-in-progress: true
+jobs:
+ add-labels:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Label issues based on keywords
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ with:
+ script: |
+ // Configuration: Add new labels and keywords here
+ const labelConfig = {
+ rocm: {
+ // Keyword search - matches whole words only (with word boundaries)
+ keywords: [
+ {
+ term: "composable kernel",
+ searchIn: "both"
+ },
+ {
+ term: "rccl",
+ searchIn: "body" // only search in body
+ },
+ {
+ term: "migraphx",
+ searchIn: "title" // only search in title
+ },
+ {
+ term: "hipgraph",
+ searchIn: "both"
+ },
+ {
+ term: "ROCm System Management Interface",
+ searchIn: "body"
+ },
+ ],
+
+ // Substring search - matches anywhere in text (partial matches)
+ substrings: [
+ {
+ term: "VLLM_ROCM_",
+ searchIn: "both"
+ },
+ {
+ term: "aiter",
+ searchIn: "title"
+ },
+ {
+ term: "rocm",
+ searchIn: "title"
+ },
+ {
+ term: "amd",
+ searchIn: "title"
+ },
+ {
+ term: "hip-",
+ searchIn: "both"
+ },
+ {
+ term: "gfx",
+ searchIn: "both"
+ },
+ {
+ term: "cdna",
+ searchIn: "both"
+ },
+ {
+ term: "rdna",
+ searchIn: "both"
+ },
+ {
+ term: "torch_hip",
+ searchIn: "body" // only in body
+ },
+ {
+ term: "_hip",
+ searchIn: "both"
+ },
+ {
+ term: "hip_",
+ searchIn: "both"
+ },
+
+ // ROCm tools and libraries
+ {
+ term: "hipify",
+ searchIn: "both"
+ },
+ ],
+
+ // Regex patterns - for complex pattern matching
+ regexPatterns: [
+ {
+ pattern: "\\bmi\\d{3}[a-z]*\\b",
+ description: "AMD GPU names (mi + 3 digits + optional letters)",
+ flags: "gi",
+ searchIn: "both" // "title", "body", or "both"
+ }
+ ],
+ },
+ };
+
+ // Helper function to create regex based on search type
+ function createSearchRegex(term, type) {
+ // Escape special regex characters in the term
+ const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+
+ switch (type) {
+ case 'keyword':
+ // Word boundary search - matches whole words only
+ return new RegExp(`\\b${escapedTerm}\\b`, "gi");
+ case 'substring':
+ // Substring search - matches anywhere in the text
+ return new RegExp(escapedTerm, "gi");
+ default:
+ throw new Error(`Unknown search type: ${type}`);
+ }
+ }
+
+ // Helper function to find matching terms in text with line information
+ function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
+ const matches = [];
+ const lines = text.split('\n');
+
+ for (const termConfig of searchTerms) {
+ let regex;
+ let term, searchIn, pattern, description, flags;
+
+ // Handle different input formats (string or object)
+ if (typeof termConfig === 'string') {
+ term = termConfig;
+ searchIn = 'both'; // default
+ } else {
+ term = termConfig.term;
+ searchIn = termConfig.searchIn || 'both';
+ pattern = termConfig.pattern;
+ description = termConfig.description;
+ flags = termConfig.flags;
+ }
+
+ // Skip if this term shouldn't be searched in the current location
+ if (searchIn !== 'both' && searchIn !== searchLocation) {
+ continue;
+ }
+
+ // Create appropriate regex
+ if (searchType === 'regex') {
+ regex = new RegExp(pattern, flags || "gi");
+ } else {
+ regex = createSearchRegex(term, searchType);
+ }
+
+ const termMatches = [];
+
+ // Check each line for matches
+ lines.forEach((line, lineIndex) => {
+ const lineMatches = line.match(regex);
+ if (lineMatches) {
+ lineMatches.forEach(match => {
+ termMatches.push({
+ match: match,
+ lineNumber: lineIndex + 1,
+ lineContent: line.trim(),
+ searchType: searchType,
+ searchLocation: searchLocation,
+ originalTerm: term || pattern,
+ description: description,
+ // Show context around the match in the line
+ context: line.length > 100 ?
+ line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+ line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
+ : line.trim()
+ });
+ });
+ }
+ });
+
+ if (termMatches.length > 0) {
+ matches.push({
+ term: term || (description || pattern),
+ searchType: searchType,
+ searchLocation: searchLocation,
+ searchIn: searchIn,
+ pattern: pattern,
+ matches: termMatches,
+ count: termMatches.length
+ });
+ }
+ }
+
+ return matches;
+ }
+
+ // Helper function to check if label should be added
+ async function processLabel(labelName, config) {
+ const body = context.payload.issue.body || "";
+ const title = context.payload.issue.title || "";
+
+ core.notice(`Processing label: ${labelName}`);
+ core.notice(`Issue Title: "${title}"`);
+ core.notice(`Issue Body length: ${body.length} characters`);
+
+ let shouldAddLabel = false;
+ let allMatches = [];
+ let reason = '';
+
+ const keywords = config.keywords || [];
+ const substrings = config.substrings || [];
+ const regexPatterns = config.regexPatterns || [];
+
+ core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
+
+ // Search in title
+ if (title.trim()) {
+ core.notice(`Searching in title: "${title}"`);
+
+ const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
+ const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
+ const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
+
+ allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
+ }
+
+ // Search in body
+ if (body.trim()) {
+ core.notice(`Searching in body (${body.length} characters)`);
+
+ const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
+ const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
+ const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
+
+ allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
+ }
+
+ if (allMatches.length > 0) {
+ core.notice(`Found ${allMatches.length} matching term(s):`);
+
+ for (const termMatch of allMatches) {
+ const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
+ const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
+
+ if (termMatch.searchType === 'regex') {
+ core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+ } else {
+ core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+ }
+
+ // Show details for each match
+ termMatch.matches.forEach((match, index) => {
+ core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
+ if (match.description) {
+ core.notice(` Description: ${match.description}`);
+ }
+ core.notice(` Context: ${match.context}`);
+ if (match.lineContent !== match.context) {
+ core.notice(` Full line: ${match.lineContent}`);
+ }
+ });
+ }
+
+ shouldAddLabel = true;
+ const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
+ const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
+ const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
+ const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
+ const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
+ const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
+
+ reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
+ }
+
+ core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
+ core.notice(`Reason: ${reason || 'No matching terms found'}`);
+
+ if (shouldAddLabel) {
+ const existingLabels = context.payload.issue.labels.map(l => l.name);
+ if (!existingLabels.includes(labelName)) {
+ await github.rest.issues.addLabels({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ labels: [labelName],
+ });
+ core.notice(`Label "${labelName}" added. ${reason}`);
+ return true;
+ }
+ core.notice(`Label "${labelName}" already present.`);
+ return false;
+ }
+
+ core.notice(`No matching terms found for label "${labelName}".`);
+ return false;
+ }
+
+ // Process all configured labels
+ const processLabels = Object.entries(labelConfig)
+ .map(([labelName, config]) => processLabel(labelName, config));
+ const labelsAdded = await Promise.all(processLabels);
+ const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+ core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
\ No newline at end of file
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
deleted file mode 100644
index 2b1086b7faf432e91231fe0eee723c1d993c3339..0000000000000000000000000000000000000000
--- a/.github/workflows/lint-and-deploy.yaml
+++ /dev/null
@@ -1,89 +0,0 @@
-name: Lint and Deploy Charts
-
-on: pull_request
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- lint-and-deploy:
- runs-on: ubuntu-latest
- steps:
- - name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- with:
- fetch-depth: 0
-
- - name: Set up Helm
- uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
- with:
- version: v3.14.4
-
- #Python is required because ct lint runs Yamale and yamllint which require Python.
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
- with:
- python-version: '3.13'
-
- - name: Set up chart-testing
- uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
- with:
- version: v3.10.1
-
- - name: Run chart-testing (lint)
- run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
-
- - name: Setup minio
- run: |
- docker network create vllm-net
- docker run -d -p 9000:9000 --name minio --net vllm-net \
- -e "MINIO_ACCESS_KEY=minioadmin" \
- -e "MINIO_SECRET_KEY=minioadmin" \
- -v /tmp/data:/data \
- -v /tmp/config:/root/.minio \
- minio/minio server /data
- export AWS_ACCESS_KEY_ID=minioadmin
- export AWS_SECRET_ACCESS_KEY=minioadmin
- export AWS_EC2_METADATA_DISABLED=true
- mkdir opt-125m
- cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
- aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
- aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
-
- - name: Create kind cluster
- uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
-
- - name: Build the Docker image vllm cpu
- run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
-
- - name: Configuration of docker images, network and namespace for the kind cluster
- run: |
- docker pull amazon/aws-cli:2.6.4
- kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
- kind load docker-image vllm-cpu-env:latest --name chart-testing
- docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
- kubectl create ns ns-vllm
-
- - name: Run chart-testing (install)
- run: |
- export AWS_ACCESS_KEY_ID=minioadmin
- export AWS_SECRET_ACCESS_KEY=minioadmin
- sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
- helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
-
- - name: curl test
- run: |
- kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
- sleep 10
- CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
- --header "Content-Type: application/json" \
- --data '{
- "model": "opt-125m",
- "prompt": "San Francisco is a",
- "max_tokens": 7,
- "temperature": 0
- }'):$CODE"
- echo "$CODE"
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
deleted file mode 100644
index bfd02879965eee1fb1eead062edd21f53798e14f..0000000000000000000000000000000000000000
--- a/.github/workflows/publish.yml
+++ /dev/null
@@ -1,111 +0,0 @@
-# This workflow will upload a Python Package to Release asset
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Create Release
-
-on:
- push:
- tags:
- - v*
-
-# Needed to create release and upload assets
-permissions:
- contents: write
-
-jobs:
- release:
- # Retrieve tag and create release
- name: Create Release
- runs-on: ubuntu-latest
- outputs:
- upload_url: ${{ steps.create_release.outputs.upload_url }}
- steps:
- - name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
- - name: Extract branch info
- shell: bash
- run: |
- echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
-
- - name: Create Release
- id: create_release
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
- env:
- RELEASE_TAG: ${{ env.release_tag }}
- with:
- github-token: "${{ secrets.GITHUB_TOKEN }}"
- script: |
- const script = require('.github/workflows/scripts/create_release.js')
- await script(github, context, core)
-
- # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
- # wheel:
- # name: Build Wheel
- # runs-on: ${{ matrix.os }}
- # needs: release
-
- # strategy:
- # fail-fast: false
- # matrix:
- # os: ['ubuntu-20.04']
- # python-version: ['3.9', '3.10', '3.11', '3.12']
- # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt.
- # cuda-version: ['11.8', '12.1']
-
- # steps:
- # - name: Checkout
- # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
- # - name: Setup ccache
- # uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
- # with:
- # create-symlink: true
- # key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
- # - name: Set up Linux Env
- # if: ${{ runner.os == 'Linux' }}
- # run: |
- # bash -x .github/workflows/scripts/env.sh
-
- # - name: Set up Python
- # uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- # with:
- # python-version: ${{ matrix.python-version }}
-
- # - name: Install CUDA ${{ matrix.cuda-version }}
- # run: |
- # bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-
- # - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
- # run: |
- # bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
-
- # - name: Build wheel
- # shell: bash
- # env:
- # CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
- # run: |
- # bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
- # wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
- # asset_name=${wheel_name//"linux"/"manylinux1"}
- # echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
- # echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
- # - name: Upload Release Asset
- # uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
- # env:
- # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- # with:
- # upload_url: ${{ needs.release.outputs.upload_url }}
- # asset_path: ./dist/${{ env.wheel_name }}
- # asset_name: ${{ env.asset_name }}
- # asset_content_type: application/*
-
- # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
- # - name: Publish package
- # uses: pypa/gh-action-pypi-publish@release/v1.8
- # with:
- # repository-url: https://test.pypi.org/legacy/
- # password: ${{ secrets.PYPI_API_TOKEN }}
- # skip-existing: true
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index 16ae1aadb96be289c4a153dda43772e3586e84cb..1ee605dc7bb0d40e3aba622cf4f18b16c878ee08 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -12,16 +12,43 @@ jobs:
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
with:
script: |
- github.rest.issues.createComment({
- owner: context.repo.owner,
- repo: context.repo.repo,
- issue_number: context.issue.number,
- body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
- '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
- 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
- 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
- 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
- '🚀'
- })
+ try {
+ // Get the PR author
+ const prAuthor = context.payload.pull_request.user.login;
+
+ // Check if this is the author's first PR in this repository
+ // Use GitHub's search API to find all PRs by this author
+ const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
+ q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
+ per_page: 100
+ });
+
+ const authorPRCount = searchResults.total_count;
+
+ console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
+
+ // Only post comment if this is the first PR (only one PR by this author)
+ if (authorPRCount === 1) {
+ console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
+ await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
+ '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
+ 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
+ 'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
+ 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
+ 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
+ 'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
+ '🚀'
+ });
+ } else {
+ console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
+ }
+ } catch (error) {
+ console.error('Error checking PR history or posting comment:', error);
+ // Don't fail the workflow, just log the error
+ }
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 612b290e88d46590f68ad2a79c7f92ab3ff6ff69..c16bdeeecd07a88ee9b8f9fb18155060a90aee31 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
- rev: v1.34.0
+ rev: v1.35.5
hooks:
- id: typos
- repo: https://github.com/PyCQA/isort
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e64b7134e6fe97ebc4178f47404d5195221cc907..e669fda88123d129e427c2462c69fc803edc9cfb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,7 +35,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx906;gfx926;gfx928;gfx936")
@@ -50,7 +50,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
#
@@ -370,9 +370,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
set(MARLIN_SRCS
- "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
- "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
@@ -556,6 +554,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+ "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -574,6 +573,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+ "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
@@ -765,6 +765,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"found in CUDA target architectures")
endif()
endif()
+
+ # Only build W4A8 kernels if we are building for something compatible with sm90a
+ cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
+ set(SRCS
+ "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${W4A8_ARCHS}")
+
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+
+ message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
+ else()
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
+ AND W4A8_ARCHS)
+ message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
+ "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+ "later if you intend on running w4a16 quantized models on "
+ "Hopper.")
+ else()
+ message(STATUS "Not building W4A8 kernels as no compatible archs "
+ "found in CUDA target architectures")
+ endif()
+ endif()
+
# if CUDA endif
endif()
@@ -806,7 +833,9 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/moe_fused_gate.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
- list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
+ list(APPEND VLLM_MOE_EXT_SRC
+ "csrc/moe/moe_wna16.cu"
+ "csrc/moe/grouped_topk_kernels.cu")
endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
diff --git a/README.md b/README.md
index 7a0b9c86e777720cd5cc632531e2cb617ae2eb03..471ec0896beaa4efeca2980d5e8f52e9d896e92d 100644
--- a/README.md
+++ b/README.md
@@ -97,7 +97,7 @@ python3 setup.py install (若调试,可使用python3 setup.py develop)
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证
-- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.10.1;
+- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.10.2rc1;
## Known Issue
- 无
diff --git a/README_ORIGIN.md b/README_ORIGIN.md
index f5738866cb6bfa3527273c3abaadeafc580342c5..d08d556a3da711fa0f2330411801a41c5b203db1 100644
--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -18,14 +18,16 @@ Easy, fast, and cheap LLM serving for everyone
*Latest News* 🔥
+- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Previous News
+- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
diff --git a/SECURITY.md b/SECURITY.md
index 414669fb3712e316497349ddeaf898b6bbf17868..d6319cdb1ac27215cd0a78ed47a408867e3ef434 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -42,4 +42,9 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
+* Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications
+ * Substantial internal deployment leveraging the upstream vLLM project.
+ * Established internal security teams and comprehensive compliance measures.
+ * Active and consistent contributions to the upstream vLLM project.
+
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 1d715a193ea14e0880ba25580d4fa838a70b7cae..38072152b653b831375b46802fd9ce88d29b2b8a 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -32,6 +32,14 @@ become available.
Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:
wget http://images.cocodataset.org/zips/train2017.zip
+
+
+ | ShareGPT4Video (Video) |
+ ✅ |
+ ✅ |
+
+ git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video
+ |
| BurstGPT |
@@ -51,6 +59,12 @@ become available.
✅ |
synthetic |
+
+ | RandomMultiModal (Image/Video) |
+ 🟡 |
+ 🚧 |
+ synthetic |
+
| Prefix Repetition |
✅ |
@@ -194,6 +208,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
```bash
vllm bench serve \
--backend openai-chat \
+ --endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
@@ -230,6 +245,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
```bash
vllm bench serve \
--backend openai-chat \
+ --endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
@@ -244,6 +260,7 @@ vllm bench serve \
```bash
vllm bench serve \
--backend openai-chat \
+ --endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \
--dataset-name hf \
@@ -609,7 +626,7 @@ vllm bench serve \
--prefix-repetition-prefix-len 512 \
--prefix-repetition-suffix-len 128 \
--prefix-repetition-num-prefixes 5 \
- --prefix-repetition-output-len 128
+ --prefix-repetition-output-len 128
```
@@ -684,4 +701,102 @@ python benchmarks/benchmark_serving.py \
--endpoint /v1/chat/completion
```
+### Videos (ShareGPT4Video)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
+ --dtype bfloat16 \
+ --limit-mm-per-prompt '{"video": 1}' \
+ --allowed-local-media-path /path/to/sharegpt4video/videos
+```
+
+Send requests with videos:
+
+```bash
+python benchmarks/benchmark_serving.py \
+ --backend openai-chat \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
+ --dataset-name sharegpt \
+ --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
+ --num-prompts 100 \
+ --save-result \
+ --result-dir ~/vllm_benchmark_results \
+ --save-detailed \
+ --endpoint /v1/chat/completion
+```
+
+### Synthetic Random Images (random-mm)
+
+Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
+
+Notes:
+
+- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
+- Video sampling is not yet implemented.
+
+Start the server (example):
+
+```bash
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+ --dtype bfloat16 \
+ --max-model-len 16384 \
+ --limit-mm-per-prompt '{"image": 3, "video": 0}' \
+ --mm-processor-kwargs max_pixels=1003520
+```
+
+Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
+
+Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
+
+```bash
+vllm bench serve \
+ --backend openai-chat \
+ --model Qwen/Qwen2.5-VL-3B-Instruct \
+ --endpoint /v1/chat/completions \
+ --dataset-name random-mm \
+ --num-prompts 100 \
+ --max-concurrency 10 \
+ --random-prefix-len 25 \
+ --random-input-len 300 \
+ --random-output-len 40 \
+ --random-range-ratio 0.2 \
+ --random-mm-base-items-per-request 2 \
+ --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
+ --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
+ --request-rate inf \
+ --ignore-eos \
+ --seed 42
+```
+
+The number of items per request can be controlled by passing multiple image buckets:
+
+```bash
+ --random-mm-base-items-per-request 2 \
+ --random-mm-num-mm-items-range-ratio 0.5 \
+ --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
+ --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
+```
+
+Flags specific to `random-mm`:
+
+- `--random-mm-base-items-per-request`: base number of multimodal items per request.
+- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
+- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
+- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
+
+Behavioral notes:
+
+- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
+
+How sampling works:
+
+- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
+- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
+- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
+This should be seen as an edge case, and if this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that this might result in errors due to engine config `--limit-mm-per-prompt`.
+- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
+
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 1559ca2d92841cc33eb1921b319bce5c512f5b90..ba7c733be0b25bdef90e4fba703ecb4640fe1787 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -34,6 +34,7 @@ class RequestFuncInput:
multi_modal_content: Optional[dict | list[dict]] = None
ignore_eos: bool = False
language: Optional[str] = None
+ request_id: Optional[str] = None
@dataclass
@@ -71,6 +72,9 @@ async def async_request_tgi(
"inputs": request_func_input.prompt,
"parameters": params,
}
+ headers = None
+ if request_func_input.request_id:
+ headers = {"x-request-id": request_func_input.request_id}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
@@ -82,7 +86,9 @@ async def async_request_tgi(
st = time.perf_counter()
most_recent_timestamp = st
try:
- async with session.post(url=api_url, json=payload) as response:
+ async with session.post(
+ url=api_url, json=payload, headers=headers
+ ) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
@@ -145,6 +151,9 @@ async def async_request_trt_llm(
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
+ headers = None
+ if request_func_input.request_id:
+ headers = {"x-request-id": request_func_input.request_id}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -152,7 +161,9 @@ async def async_request_trt_llm(
st = time.perf_counter()
most_recent_timestamp = st
try:
- async with session.post(url=api_url, json=payload) as response:
+ async with session.post(
+ url=api_url, json=payload, headers=headers
+ ) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
@@ -211,6 +222,8 @@ async def async_request_deepspeed_mii(
"top_p": 1.0,
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+ if request_func_input.request_id:
+ headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -283,6 +296,8 @@ async def async_request_openai_completions(
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+ if request_func_input.request_id:
+ headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -395,6 +410,8 @@ async def async_request_openai_chat_completions(
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
+ if request_func_input.request_id:
+ headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -491,6 +508,8 @@ async def async_request_openai_audio(
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
+ if request_func_input.request_id:
+ headers["x-request-id"] = request_func_input.request_id
# Send audio file
def to_bytes(y, sr):
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 572292a5aca46faad63f839a6fadf578424366ea..2ea4f9ccaff2b70d57a791de36dd9ec9ace69fe3 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -19,6 +19,7 @@ import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
+from copy import deepcopy
from dataclasses import dataclass
from functools import cache
from io import BytesIO
@@ -54,6 +55,7 @@ class SampleRequest:
expected_output_len: int
multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
lora_request: Optional[LoRARequest] = None
+ request_id: Optional[str] = None
# -----------------------------------------------------------------------------
@@ -155,7 +157,10 @@ class BenchmarkDataset(ABC):
@abstractmethod
def sample(
- self, tokenizer: PreTrainedTokenizerBase, num_requests: int
+ self,
+ tokenizer: PreTrainedTokenizerBase,
+ num_requests: int,
+ request_id_prefix: str = "",
) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
@@ -167,6 +172,7 @@ class BenchmarkDataset(ABC):
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
for processing the dataset's text.
num_requests (int): The number of sample requests to generate.
+ request_id_prefix (str) The prefix of request_id.
Returns:
list[SampleRequest]: A list of sample requests generated from the
@@ -175,7 +181,10 @@ class BenchmarkDataset(ABC):
raise NotImplementedError("sample must be implemented in subclasses.")
def maybe_oversample_requests(
- self, requests: list[SampleRequest], num_requests: int
+ self,
+ requests: list[SampleRequest],
+ num_requests: int,
+ request_id_prefix: str = "",
) -> None:
"""
Oversamples the list of requests if its size is less than the desired
@@ -183,11 +192,18 @@ class BenchmarkDataset(ABC):
Args:
requests (List[SampleRequest]): The current list of sampled
- requests. num_requests (int): The target number of requests.
+ requests.
+ num_requests (int): The target number of requests.
+ request_id_prefix (str) The prefix of the request ids.
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
- additional = random.choices(requests, k=num_requests - len(requests))
+ additional = deepcopy(
+ random.choices(requests, k=num_requests - len(requests))
+ )
+ for i in range(len(additional)):
+ req = additional[i]
+ req.request_id = request_id_prefix + str(len(requests) + i)
requests.extend(additional)
logger.info("Oversampled requests to reach %d total samples.", num_requests)
@@ -277,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
)
+def process_video(video: Any) -> Mapping[str, Any]:
+ """
+ Process a single video input and return a multimedia content dictionary.
+
+ Supports the following input types:
+
+ 1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
+ containing raw video data.
+
+ 2. String input: - Treats the string as a URL or local file path. -
+ Prepends "file://" if the string doesn't start with "http://" or
+ "file://". - Returns a dictionary with the image URL.
+
+ Raises:
+ ValueError: If the input is not a supported type.
+ """
+ if isinstance(video, dict) and "bytes" in video:
+ video_bytes = video["bytes"]
+ video_base64 = base64.b64encode(video_bytes).decode("utf-8")
+ return {
+ "type": "video_url",
+ "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+ }
+
+ if isinstance(video, str):
+ video_url = (
+ video if video.startswith(("http://", "file://")) else f"file://{video}"
+ )
+ return {"type": "video_url", "video_url": {"url": video_url}}
+
+ raise ValueError(
+ f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`." # noqa: E501
+ )
+
+
# -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data)
# -----------------------------------------------------------------------------
@@ -303,6 +354,7 @@ class RandomDataset(BenchmarkDataset):
range_ratio: float = DEFAULT_RANGE_RATIO,
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
+ request_id_prefix: str = "",
**kwargs,
) -> list[SampleRequest]:
# Enforce range_ratio < 1
@@ -363,8 +415,10 @@ class RandomDataset(BenchmarkDataset):
prompt=prompt,
prompt_len=total_input_len,
expected_output_len=int(output_lens[i]),
+ request_id=request_id_prefix + str(i),
)
)
+
return requests
@@ -406,9 +460,11 @@ class ShareGPTDataset(BenchmarkDataset):
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
samples: list = []
+ ind = 0
for entry in self.data:
if len(samples) >= num_requests:
break
@@ -430,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len is not None,
):
continue
- # TODO: Also support ShareGPT4Video.
if image_path := entry.get("image"):
mm_content = process_image(image_path)
+ elif video_path := entry.get("video"):
+ mm_content = process_video(video_path)
else:
mm_content = None
if enable_multimodal_chat:
@@ -444,9 +501,11 @@ class ShareGPTDataset(BenchmarkDataset):
expected_output_len=new_output_len,
lora_request=lora_request,
multi_modal_data=mm_content,
+ request_id=request_id_prefix + str(ind),
)
)
- self.maybe_oversample_requests(samples, num_requests)
+ ind += 1
+ self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
return samples
@@ -512,10 +571,11 @@ class CustomDataset(BenchmarkDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
sampled_requests = []
- for item in self.data:
+ for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
break
prompt = item["prompt"]
@@ -534,9 +594,12 @@ class CustomDataset(BenchmarkDataset):
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
+ request_id=request_id_prefix + str(i),
)
)
- self.maybe_oversample_requests(sampled_requests, num_requests)
+ self.maybe_oversample_requests(
+ sampled_requests, num_requests, request_id_prefix
+ )
return sampled_requests
@@ -578,6 +641,7 @@ class SonnetDataset(BenchmarkDataset):
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
return_prompt_formatted: bool = False,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
# Calculate average token length for a poem line.
@@ -603,6 +667,7 @@ class SonnetDataset(BenchmarkDataset):
prefix_lines = self.data[:num_prefix_lines]
samples = []
+ ind = 0
while len(samples) < num_requests:
extra_lines = random.choices(
self.data, k=num_input_lines - num_prefix_lines
@@ -613,14 +678,17 @@ class SonnetDataset(BenchmarkDataset):
msg, add_generation_prompt=True, tokenize=False
)
prompt_len = len(tokenizer(prompt_formatted).input_ids)
+
if prompt_len <= input_len:
samples.append(
SampleRequest(
prompt=prompt_formatted if return_prompt_formatted else prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
+ request_id=request_id_prefix + str(ind),
)
)
+ ind += 1
return samples
@@ -672,6 +740,7 @@ class BurstGPTDataset(BenchmarkDataset):
num_requests: int,
max_loras: Optional[int] = None,
lora_path: Optional[str] = None,
+ request_id_prefix: str = "",
**kwargs,
) -> list[SampleRequest]:
samples = []
@@ -693,6 +762,7 @@ class BurstGPTDataset(BenchmarkDataset):
prompt_len=input_len,
expected_output_len=output_len,
lora_request=lora_req,
+ request_id=request_id_prefix + str(i),
)
)
return samples
@@ -752,12 +822,14 @@ class ConversationDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
sampled_requests = []
dynamic_output = output_len is None
+ ind = 0
for item in filtered_data:
if len(sampled_requests) >= num_requests:
@@ -785,9 +857,13 @@ class ConversationDataset(HuggingFaceDataset):
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
+ request_id=request_id_prefix + str(ind),
)
)
- self.maybe_oversample_requests(sampled_requests, num_requests)
+ ind += 1
+ self.maybe_oversample_requests(
+ sampled_requests, num_requests, request_id_prefix
+ )
return sampled_requests
@@ -814,11 +890,12 @@ class VisionArenaDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
sampled_requests = []
- for item in self.data:
+ for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
break
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
@@ -838,9 +915,12 @@ class VisionArenaDataset(HuggingFaceDataset):
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
+ request_id=request_id_prefix + str(i),
)
)
- self.maybe_oversample_requests(sampled_requests, num_requests)
+ self.maybe_oversample_requests(
+ sampled_requests, num_requests, request_id_prefix
+ )
return sampled_requests
@@ -870,15 +950,18 @@ class InstructCoderDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
sampled_requests = []
- for item in self.data:
+ for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
break
- prompt = f"{item['input']}\n\n{item['instruction']} Just output \
- the code, do not include any explanation."
+ prompt = (
+ f"{item['input']}\n\n{item['instruction']} Just output "
+ "the code, do not include any explanation."
+ )
# apply template
prompt = tokenizer.apply_chat_template(
@@ -892,9 +975,12 @@ class InstructCoderDataset(HuggingFaceDataset):
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
+ request_id=request_id_prefix + str(i),
)
)
- self.maybe_oversample_requests(sampled_requests, num_requests)
+ self.maybe_oversample_requests(
+ sampled_requests, num_requests, request_id_prefix
+ )
return sampled_requests
@@ -924,12 +1010,13 @@ class MTBenchDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
sampled_requests = []
- for item in self.data:
+ for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
break
prompt = item["turns"][0]
@@ -947,9 +1034,12 @@ class MTBenchDataset(HuggingFaceDataset):
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
+ request_id=request_id_prefix + str(i),
)
)
- self.maybe_oversample_requests(sampled_requests, num_requests)
+ self.maybe_oversample_requests(
+ sampled_requests, num_requests, request_id_prefix
+ )
return sampled_requests
@@ -974,10 +1064,12 @@ class AIMODataset(HuggingFaceDataset):
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
sampled_requests = []
dynamic_output = output_len is None
+ ind = 0
for item in self.data:
if len(sampled_requests) >= num_requests:
@@ -1000,9 +1092,13 @@ class AIMODataset(HuggingFaceDataset):
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=None,
+ request_id=request_id_prefix + str(ind),
)
)
- self.maybe_oversample_requests(sampled_requests, num_requests)
+ ind += 1
+ self.maybe_oversample_requests(
+ sampled_requests, num_requests, request_id_prefix
+ )
return sampled_requests
@@ -1072,12 +1168,18 @@ class NextEditPredictionDataset(HuggingFaceDataset):
"zed-industries/zeta": _format_zeta_prompt,
}
- def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs):
+ def sample(
+ self,
+ tokenizer: PreTrainedTokenizerBase,
+ num_requests: int,
+ request_id_prefix: str = "",
+ **kwargs,
+ ):
formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
if formatting_prompt_func is None:
raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
samples = []
- for sample in self.data:
+ for i, sample in enumerate(self.data):
sample = formatting_prompt_func(sample)
samples.append(
SampleRequest(
@@ -1086,11 +1188,12 @@ class NextEditPredictionDataset(HuggingFaceDataset):
expected_output_len=len(
tokenizer(sample["expected_output"]).input_ids
),
+ request_id=request_id_prefix + str(i),
)
)
if len(samples) >= num_requests:
break
- self.maybe_oversample_requests(samples, num_requests)
+ self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
return samples
@@ -1139,6 +1242,7 @@ class ASRDataset(HuggingFaceDataset):
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
+ request_id_prefix: str = "",
**kwargs,
) -> list:
import librosa
@@ -1148,6 +1252,7 @@ class ASRDataset(HuggingFaceDataset):
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests = []
skipped = 0
+ ind = 0
for item in self.data:
if len(sampled_requests) >= num_requests:
break
@@ -1166,8 +1271,10 @@ class ASRDataset(HuggingFaceDataset):
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
+ request_id=request_id_prefix + str(ind),
)
)
+ ind += 1
if skipped:
logger.warning(
"%d samples discarded from dataset due to"
@@ -1175,5 +1282,7 @@ class ASRDataset(HuggingFaceDataset):
" what Whisper supports.",
skipped,
)
- self.maybe_oversample_requests(sampled_requests, num_requests)
+ self.maybe_oversample_requests(
+ sampled_requests, num_requests, request_id_prefix
+ )
return sampled_requests
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index ae38caf7290b1ce2dd9858865e42695ef3ab5452..02f5f585c0c1677db7133b994bf3f090b43448a2 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -375,11 +375,12 @@ async def benchmark(
rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
last_int_rps = current_int_rps
- prompt, prompt_len, output_len, mm_content = (
+ prompt, prompt_len, output_len, mm_content, request_id = (
request.prompt,
request.prompt_len,
request.expected_output_len,
request.multi_modal_data,
+ request.request_id,
)
req_model_id, req_model_name = model_id, model_name
if lora_modules:
@@ -397,6 +398,7 @@ async def benchmark(
multi_modal_content=mm_content,
ignore_eos=ignore_eos,
extra_body=extra_body,
+ request_id=request_id,
)
task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
tasks.append(asyncio.create_task(task))
@@ -665,6 +667,7 @@ def main(args: argparse.Namespace):
tokenizer=tokenizer,
output_len=args.custom_output_len,
skip_chat_template=args.custom_skip_chat_template,
+ request_id_prefix=args.request_id_prefix,
)
elif args.dataset_name == "sonnet":
@@ -678,6 +681,7 @@ def main(args: argparse.Namespace):
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
return_prompt_formatted=False,
+ request_id_prefix=args.request_id_prefix,
)
else:
assert tokenizer.chat_template or tokenizer.default_chat_template, (
@@ -690,6 +694,7 @@ def main(args: argparse.Namespace):
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
return_prompt_formatted=True,
+ request_id_prefix=args.request_id_prefix,
)
elif args.dataset_name == "hf":
@@ -751,6 +756,7 @@ def main(args: argparse.Namespace):
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.hf_output_len,
+ request_id_prefix=args.request_id_prefix,
)
else:
@@ -762,10 +768,15 @@ def main(args: argparse.Namespace):
tokenizer=tokenizer,
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
+ request_id_prefix=args.request_id_prefix,
),
"burstgpt": lambda: BurstGPTDataset(
random_seed=args.seed, dataset_path=args.dataset_path
- ).sample(tokenizer=tokenizer, num_requests=args.num_prompts),
+ ).sample(
+ tokenizer=tokenizer,
+ num_requests=args.num_prompts,
+ request_id_prefix=args.request_id_prefix,
+ ),
"random": lambda: RandomDataset(dataset_path=args.dataset_path).sample(
tokenizer=tokenizer,
num_requests=args.num_prompts,
@@ -773,6 +784,7 @@ def main(args: argparse.Namespace):
input_len=args.random_input_len,
output_len=args.random_output_len,
range_ratio=args.random_range_ratio,
+ request_id_prefix=args.request_id_prefix,
),
}
@@ -1118,6 +1130,13 @@ def create_argument_parser():
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve",
)
+ parser.add_argument(
+ "--request-id-prefix",
+ type=str,
+ required=False,
+ default="benchmark-serving",
+ help="Specify the prefix of request id.",
+ )
# group for dataset specific arguments
custom_group = parser.add_argument_group("custom dataset options")
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 7dc58f48faa907c10011c07f8de1524a777e3e54..5d96bd68cf7d276c585e3daf3a9fdad59a8e6386 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -150,7 +150,6 @@ def run_vllm(
end = time.perf_counter()
else:
assert lora_requests is None, "BeamSearch API does not support LoRA"
- prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
output_len = requests[0].expected_output_len
for request in requests:
@@ -653,8 +652,8 @@ def validate_args(args):
# https://github.com/vllm-project/vllm/issues/16222
if args.data_parallel_size > 1:
raise ValueError(
- "Data parallel is not supported in offline benchmark, \
- please use benchmark serving instead"
+ "Data parallel is not supported in offline benchmark, "
+ "please use benchmark serving instead"
)
diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9663503e9baa0907aec3e73ba1f1db85107a2ed0
--- /dev/null
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -0,0 +1,114 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+ w8a8_block_fp8_matmul,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton as vllm_triton
+
+assert current_platform.is_cuda(), (
+ "Only support benchmarking w8a8 block fp8 kernel on CUDA device."
+)
+
+# DeepSeek-V3 weight shapes
+DEEPSEEK_V3_SHAPES = [
+ (512 + 64, 7168),
+ (2112, 7168),
+ ((128 + 64) * 128, 7168),
+ (128 * (128 + 128), 512),
+ (7168, 16384),
+ (7168, 18432),
+ (18432 * 2, 7168),
+ (24576, 1536),
+ (12288, 7168),
+ (4096, 7168),
+ (7168, 2048),
+]
+
+
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+ """Build runner function for w8a8 block fp8 matmul."""
+ factor_for_scale = 1e-2
+
+ fp8_info = torch.finfo(torch.float8_e4m3fn)
+ fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+ # Create random FP8 tensors
+ A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+ A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+ B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+ B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+ # Create scales
+ block_n, block_k = block_size[0], block_size[1]
+ n_tiles = (N + block_n - 1) // block_n
+ k_tiles = (K + block_k - 1) // block_k
+
+ As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
+ Bs = (
+ torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
+ * factor_for_scale
+ )
+
+ def run():
+ return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+
+ return run
+
+
+@vllm_triton.testing.perf_report(
+ vllm_triton.testing.Benchmark(
+ x_names=["batch_size"],
+ x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
+ x_log=False,
+ line_arg="provider",
+ line_vals=["torch-bf16", "w8a8-block-fp8"],
+ line_names=["torch-bf16", "w8a8-block-fp8"],
+ ylabel="TFLOP/s (larger is better)",
+ plot_name="BF16 vs W8A8 Block FP8 GEMMs",
+ args={},
+ )
+)
+def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
+ M = batch_size
+ device = "cuda"
+
+ quantiles = [0.5, 0.2, 0.8]
+
+ if provider == "torch-bf16":
+ a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
+ b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
+ )
+ else: # w8a8-block-fp8
+ run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
+ ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+ lambda: run_w8a8(), quantiles=quantiles
+ )
+
+ to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+ return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+if __name__ == "__main__":
+ block_size = (128, 128)
+
+ for N, K in DEEPSEEK_V3_SHAPES:
+ print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")
+
+ print(f"TFLOP/s comparison (block_size={block_size}):")
+ benchmark_tflops.run(
+ print_data=True,
+ # show_plots=False,
+ # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
+ N=N,
+ K=K,
+ block_size=block_size,
+ )
+
+ print("\nBenchmark finished!")
diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
index 1d4e730f99ae911535caa4cca17122656db06a4b..a6b42406b5cb06b793cb043c2d7f24fab827e078 100644
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -80,6 +80,11 @@ def bench_run(
a, score, topk, renormalize=False
)
+ ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+ ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+ c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+ c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
def run_triton_moe(
a: torch.Tensor,
w1: torch.Tensor,
@@ -111,6 +116,10 @@ def bench_run(
w2: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
+ ab_strides1: torch.Tensor,
+ ab_strides2: torch.Tensor,
+ c_strides1: torch.Tensor,
+ c_strides2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
per_act_token: bool,
@@ -125,6 +134,10 @@ def bench_run(
topk_ids,
w1_scale,
w2_scale,
+ ab_strides1,
+ ab_strides2,
+ c_strides1,
+ c_strides2,
per_act_token,
a1_scale=None,
)
@@ -136,6 +149,10 @@ def bench_run(
w2_q: torch.Tensor,
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
+ ab_strides1: torch.Tensor,
+ ab_strides2: torch.Tensor,
+ c_strides1: torch.Tensor,
+ c_strides2: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
):
@@ -150,6 +167,10 @@ def bench_run(
topk_ids,
w1_scale,
w2_scale,
+ ab_strides1,
+ ab_strides2,
+ c_strides1,
+ c_strides2,
per_act_token,
a1_scale=None,
)
@@ -194,6 +215,10 @@ def bench_run(
w2_q,
w1_scale,
w2_scale,
+ ab_strides1,
+ ab_strides2,
+ c_strides1,
+ c_strides2,
topk_weights,
topk_ids,
)
@@ -231,6 +256,10 @@ def bench_run(
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"per_act_token": per_act_token,
+ "ab_strides1": ab_strides1,
+ "ab_strides2": ab_strides2,
+ "c_strides1": c_strides1,
+ "c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
@@ -289,6 +318,10 @@ def bench_run(
w2_q,
w1_scale,
w2_scale,
+ ab_strides1,
+ ab_strides2,
+ c_strides1,
+ c_strides2,
topk_weights,
topk_ids,
per_act_token,
@@ -297,7 +330,7 @@ def bench_run(
results.append(
benchmark.Timer(
- stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
+ stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 975d10f2e92ec80ce0a6425923311c8c3a7cd9a6..1b1c3b321cce44d61ec6fb3e0f97f22950956978 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -253,28 +253,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
else:
assert bt.a.dtype == torch.int8
assert bt.wtype == scalar_types.uint4b8
-
- if bt.w_ch_s is not None:
- s_ch = bt.w_ch_s.to(torch.float32)
- else:
- s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device)
-
- if bt.w_tok_s is not None:
- s_tok = bt.w_tok_s.to(torch.float32)
- else:
- s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device)
-
- fn = lambda: ops.marlin_qqq_gemm(
- a=bt.a,
- b_q_weight=w_q,
- s_group=w_s,
- s_tok=s_tok,
- s_ch=s_ch,
- workspace=workspace.scratch,
- size_m=bt.a.shape[0],
- size_n=bt.w_ref.shape[1],
- size_k=bt.w_ref.shape[0],
- )
+ raise NotImplementedError("QQQ is not supported anymore")
return fn
@@ -305,6 +284,25 @@ def machete_create_bench_fn(
)
+def cutlass_w4a8_create_bench_fn(
+ bt: BenchmarkTensors, out_type=torch.dtype, schedule=None
+) -> Callable:
+ w_q = bt.w_q.t().contiguous().t() # make col major
+ w_q = ops.cutlass_encode_and_reorder_int4b(w_q)
+ # expects fp8 scales
+ w_s = ops.cutlass_pack_scale_fp8(bt.w_g_s.to(torch.float8_e4m3fn))
+
+ return lambda: ops.cutlass_w4a8_mm(
+ a=bt.a,
+ b_q=w_q,
+ b_group_scales=w_s,
+ b_group_size=bt.group_size,
+ b_channel_scales=bt.w_ch_s,
+ a_token_scales=bt.w_tok_s,
+ maybe_schedule=schedule,
+ )
+
+
# impl
# bench
@@ -406,6 +404,20 @@ def bench(
)
)
+ # cutlass w4a8
+ if types.act_type == torch.float8_e4m3fn and group_size == 128:
+ timers.append(
+ bench_fns(
+ label,
+ sub_label,
+ f"cutlass w4a8 ({name_type_string})",
+ [
+ cutlass_w4a8_create_bench_fn(bt, out_type=types.output_type)
+ for bt in benchmark_tensors
+ ],
+ )
+ )
+
if sweep_schedules:
global _SWEEP_SCHEDULES_RESULTS
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index dbe03002fcebd41898a018851df530c8ad152f1e..a5073a0f4e331c44440f2982f87437f9b58d7b59 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -489,8 +489,10 @@ class BenchmarkWorker:
)
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
+ block_n = block_quant_shape[0] if block_quant_shape else None
+ block_k = block_quant_shape[1] if block_quant_shape else None
op_config = get_moe_configs(
- num_experts, shard_intermediate_size // 2, dtype_str, use_nn_moe=nn_moe
+ num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k, use_nn_moe=nn_moe
)
if op_config is None:
config = get_default_config(
@@ -500,8 +502,8 @@ class BenchmarkWorker:
hidden_size,
topk,
dtype_str,
- is_marlin=False,
- use_nn_moe=nn_moe
+ block_quant_shape,
+ use_nn_moe=nn_moe,
)
else:
config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..0650cbf3cc18e1c533c0bb9da6ad5914e5a49979
--- /dev/null
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
+
+import torch
+
+from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+ silu_mul_fp8_quant_deep_gemm,
+)
+from vllm.platforms import current_platform
+
+
+def benchmark(E, T, H, G=128, runs=50):
+ current_platform.seed_everything(42)
+ y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
+ tokens_per_expert = torch.randint(
+ T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
+ )
+
+ # Warmup
+ for _ in range(10):
+ silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
+ torch.cuda.synchronize()
+
+ # Benchmark
+ torch.cuda.synchronize()
+ start = time.perf_counter()
+ for _ in range(runs):
+ silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
+ torch.cuda.synchronize()
+
+ avg_time = (time.perf_counter() - start) / runs * 1000
+
+ # Calculate actual work done (only count valid tokens)
+ actual_tokens = tokens_per_expert.sum().item()
+ actual_elements = actual_tokens * H
+
+ # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
+ ops_per_element = 8
+ total_ops = actual_elements * ops_per_element
+ gflops = total_ops / (avg_time / 1000) / 1e9
+
+ # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
+ input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs
+ output_bytes = actual_tokens * H * 1 # H fp8 outputs
+ scale_bytes = actual_tokens * (H // G) * 4 # scales in float32
+ total_bytes = input_bytes + output_bytes + scale_bytes
+ memory_bw = total_bytes / (avg_time / 1000) / 1e9
+
+ return avg_time, gflops, memory_bw
+
+
+configs = [
+ (8, 32, 1024),
+ (16, 64, 2048),
+ (32, 128, 4096),
+ # DeepSeekV3 Configs
+ (256, 16, 7168),
+ (256, 32, 7168),
+ (256, 64, 7168),
+ (256, 128, 7168),
+ (256, 256, 7168),
+ (256, 512, 7168),
+ (256, 1024, 7168),
+]
+
+print(f"GPU: {torch.cuda.get_device_name()}")
+print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
+print("-" * 50)
+
+for E, T, H in configs:
+ try:
+ time_ms, gflops, gbps = benchmark(E, T, H)
+ print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
+ except Exception:
+ print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 77136edca45b5031a235698c0a392a82f1a9ed5f..603ce5ecf0d2c60ca4de585605fbe89f9529f61f 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -3,16 +3,17 @@
import csv
import os
-import random
from datetime import datetime
+from typing import Optional
import flashinfer
import torch
-FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
+from vllm.utils import round_up
-# KV Cache Layout for TRT-LLM
-# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim)
+FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
+FP8_DTYPE = torch.float8_e4m3fn
+FP4_DTYPE = torch.uint8
def to_float8(x, dtype=torch.float8_e4m3fn):
@@ -26,149 +27,188 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_decode(
- num_seqs,
- max_seq_len,
- page_size=16,
- dtype=torch.bfloat16,
- kv_layout="HND",
- num_kv_heads=8,
- kv_cache_dtype="auto",
- head_dim=128,
- warmup=10,
- trials=20,
+ dtype: torch.dtype,
+ quant_dtypes: tuple[
+ Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
+ ],
+ batch_size: int,
+ max_seq_len: int,
+ num_heads: tuple[int, int] = (64, 8),
+ head_size: int = 128,
+ kv_layout: str = "HND",
+ block_size: int = 16,
+ warmup: int = 10,
+ trials: int = 20,
):
torch.set_default_device("cuda")
- device = "cuda"
torch.manual_seed(0)
- HEAD_GRP_SIZE = 8
- MAX_SEQ_LEN = max_seq_len
+ q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
+ q_quant_dtype = q_quant_dtype or dtype
+ kv_quant_dtype = kv_quant_dtype or dtype
+ o_quant_dtype = o_quant_dtype or dtype
- # large number to reduce kv_cache reuse
- NUM_BLOCKS = int(256000 / page_size)
+ num_qo_heads, num_kv_heads = num_heads
+ assert num_qo_heads % num_kv_heads == 0
- workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8, device=device)
+ sm_scale = float(1.0 / (head_size**0.5))
- # For decode, batch_size is num_decode_token
- num_qo_heads = num_kv_heads * HEAD_GRP_SIZE
- sm_scale = float(1.0 / (head_dim**0.5))
- q = torch.randn(num_seqs, num_qo_heads, head_dim, device=device, dtype=dtype)
- kv_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
-
- max_kv_len = max(kv_lens)
- kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int, device=device)
- max_num_blocks_per_seq = (max_kv_len + page_size - 1) // page_size
+ # large number to reduce kv_cache reuse
+ NUM_BLOCKS = int(256000 / block_size)
+
+ kv_cache_shape = None
+ if kv_layout == "NHD":
+ kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size)
+ elif kv_layout == "HND":
+ kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size)
+ else:
+ raise ValueError(f"Invalid kv_layout: {kv_layout}")
+
+ # Always using 1.0 scale to reflect the real perf in benchmarking
+ q_scale = 1.0
+ ref_query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
+ if q_quant_dtype == FP8_DTYPE:
+ query, _ = to_float8(ref_query)
+ else:
+ query = ref_query
+
+ kv_lens = torch.randint(1, max_seq_len, (batch_size,), dtype=torch.int32)
+ kv_lens[-1] = max_seq_len
+
+ seq_lens = kv_lens
+ max_seq_len = torch.max(seq_lens).item()
+
+ # Always using 1.0 scale to reflect the real perf in benchmarking
+ k_scale = v_scale = 1.0
+ ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
+ if kv_quant_dtype == FP8_DTYPE:
+ kv_cache, _ = to_float8(ref_kv_cache)
+ else:
+ kv_cache = ref_kv_cache
+ max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = torch.randint(
- 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+ 0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
)
-
- kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim)
- kv_cache = torch.randn(size=kv_cache_shape, device=device, dtype=dtype)
- k_scale = v_scale = 1.0
-
- if kv_cache_dtype.startswith("fp8"):
- kv_cache, _ = to_float8(kv_cache)
-
- output_trtllm = torch.empty(q.shape, dtype=dtype)
-
- # Benchmark TRT decode
- def trt_decode():
- return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
- q,
- kv_cache,
- workspace_buffer,
- block_tables,
- kv_lens_tensor,
- max_kv_len,
- bmm1_scale=k_scale * sm_scale,
- bmm2_scale=v_scale,
- out=output_trtllm,
- )
-
- def time_fn(fn, warmup=10, trials=20):
- torch.cuda.synchronize()
- start = torch.cuda.Event(enable_timing=True)
- end = torch.cuda.Event(enable_timing=True)
- times = []
- for i in range(warmup):
- fn()
- for i in range(trials):
- start.record()
- fn()
- end.record()
- torch.cuda.synchronize()
- times.append(start.elapsed_time(end)) # ms
- return sum(times) / len(times), torch.std(torch.tensor(times))
-
- # TRT Decode
- trt_mean, trt_std = time_fn(trt_decode)
-
kv_indptr = [0]
kv_indices = []
kv_last_page_lens = []
- for i in range(num_seqs):
- seq_len = kv_lens[i]
+ for i in range(batch_size):
+ seq_len = seq_lens[i]
assert seq_len > 0
- num_blocks = (seq_len + page_size - 1) // page_size
+ num_blocks = (seq_len + block_size - 1) // block_size
kv_indices.extend(block_tables[i, :num_blocks])
kv_indptr.append(kv_indptr[-1] + num_blocks)
- kv_last_page_len = seq_len % page_size
+ kv_last_page_len = seq_len % block_size
if kv_last_page_len == 0:
- kv_last_page_len = page_size
+ kv_last_page_len = block_size
kv_last_page_lens.append(kv_last_page_len)
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
-
- output_baseline = torch.empty(q.shape, dtype=dtype)
+ workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8)
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
workspace_buffer,
kv_layout,
- use_tensor_cores=((num_qo_heads // num_kv_heads) > 4),
+ use_tensor_cores=True,
)
-
wrapper.plan(
kv_indptr,
kv_indices,
kv_last_page_lens,
num_qo_heads,
num_kv_heads,
- head_dim,
- page_size,
+ head_size,
+ block_size,
"NONE",
+ sm_scale=sm_scale,
q_data_type=dtype,
- kv_data_type=torch.float8_e4m3fn if kv_cache_dtype.startswith("fp8") else dtype,
+ kv_data_type=dtype,
)
+ def time_fn(fn, warmup=10, trials=20):
+ torch.cuda.synchronize()
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+ times = []
+ for i in range(warmup):
+ fn()
+ for i in range(trials):
+ start.record()
+ fn()
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end)) # ms
+ return sum(times) / len(times), torch.std(torch.tensor(times))
+
+ o_scale = 1.0
+ o_sf_scale = None
+ output_baseline = torch.empty(ref_query.shape, dtype=dtype)
+ if o_quant_dtype == FP4_DTYPE:
+ o_sf_scale = 500.0
+ output_trtllm = flashinfer.utils.FP4Tensor(
+ torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
+ torch.empty(
+ (
+ round_up(query.shape[0], 128),
+ round_up(query.shape[1] * query.shape[2] // 16, 4),
+ ),
+ dtype=torch.float8_e4m3fn,
+ ),
+ )
+ else:
+ output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
+
def baseline_decode():
- return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale, output_baseline)
+ return wrapper.run(
+ ref_query,
+ ref_kv_cache,
+ k_scale=k_scale,
+ v_scale=v_scale,
+ out=output_baseline,
+ )
+
+ def trtllm_decode():
+ return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
+ query=query,
+ kv_cache=kv_cache,
+ workspace_buffer=workspace_buffer,
+ block_tables=block_tables,
+ seq_lens=seq_lens,
+ max_seq_len=max_seq_len,
+ bmm1_scale=q_scale * k_scale * sm_scale,
+ bmm2_scale=v_scale / o_scale,
+ o_sf_scale=o_sf_scale,
+ out=output_trtllm,
+ )
baseline_mean, baseline_std = time_fn(baseline_decode)
+ trtllm_mean, trtllm_std = time_fn(trtllm_decode)
# Calculate percentage speedup (positive means TRT is faster)
- speedup_percent = (baseline_mean - trt_mean) / baseline_mean
+ speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean
print(
- f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.3f}\t{trt_std.item():.3f}"
+ f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:.3f}\t{trtllm_std.item():.3f}"
f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}"
)
# Return results for CSV writing
return {
- "num_seqs": num_seqs,
- "trt_mean": trt_mean,
- "trt_std": trt_std.item(),
+ "batch_size": batch_size,
+ "trtllm_mean": trtllm_mean,
+ "trtllm_std": trtllm_std.item(),
"baseline_mean": baseline_mean,
"baseline_std": baseline_std.item(),
"speedup_percent": speedup_percent,
- "q_dtype": str(dtype),
- "kv_cache_dtype": kv_cache_dtype,
- "page_size": page_size,
+ "q_dtype": str(q_quant_dtype),
+ "kv_cache_dtype": str(kv_quant_dtype),
+ "output_dtype": str(o_quant_dtype),
+ "block_size": block_size,
"num_kv_heads": num_kv_heads,
- "head_dim": head_dim,
+ "head_size": head_size,
"max_seq_len": max_seq_len,
}
@@ -180,17 +220,18 @@ def write_results_to_csv(results, filename=None):
filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"
fieldnames = [
- "num_seqs",
- "trt_mean",
- "trt_std",
+ "batch_size",
+ "trtllm_mean",
+ "trtllm_std",
"baseline_mean",
"baseline_std",
"speedup_percent",
"q_dtype",
"kv_cache_dtype",
- "page_size",
+ "output_dtype",
+ "block_size",
"num_kv_heads",
- "head_dim",
+ "head_size",
"max_seq_len",
]
@@ -209,45 +250,43 @@ def write_results_to_csv(results, filename=None):
if __name__ == "__main__":
- num_seqs = [1, 4, 8, 16, 32, 64, 128, 256]
+ batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256]
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
all_results = []
- print(
- "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
- "output_dtype: bfloat16"
- )
- print(
- "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
- "baseline_std\tspeedup_percent"
- )
- for max_seq_len in max_seq_lens:
- for bs in num_seqs:
- result = benchmark_decode(
- bs,
- max_seq_len,
- dtype=torch.bfloat16,
- kv_cache_dtype="auto",
- )
- all_results.append(result)
+ dtype = torch.bfloat16
+ quant_dtypes = [
+ # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
+ (None, None, None),
+ (None, FP8_DTYPE, None),
+ (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
+ (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
+ ]
- print(
- "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8, "
- "output_dtype: bfloat16"
- )
- print(
- "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
- "baseline_std\tspeedup_percent"
- )
- for max_seq_len in max_seq_lens:
- for bs in num_seqs:
- result = benchmark_decode(
- bs,
- max_seq_len,
- dtype=torch.bfloat16,
- kv_cache_dtype="fp8",
- )
- all_results.append(result)
+ for quant_dtype in quant_dtypes:
+ q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype
+ q_quant_dtype = q_quant_dtype or dtype
+ kv_quant_dtype = kv_quant_dtype or dtype
+ o_quant_dtype = o_quant_dtype or dtype
+
+ print(
+ f"Running benchmark for q_dtype = {q_quant_dtype}, "
+ f"kv_cache_dtype: {kv_quant_dtype}, "
+ f"output_dtype: {o_quant_dtype}"
+ )
+ print(
+ "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t"
+ "baseline_std\tspeedup_percent"
+ )
+ for max_seq_len in max_seq_lens:
+ for bs in batch_sizes:
+ result = benchmark_decode(
+ dtype=dtype,
+ quant_dtypes=quant_dtype,
+ batch_size=bs,
+ max_seq_len=max_seq_len,
+ )
+ all_results.append(result)
# Write all results to CSV
write_results_to_csv(all_results)
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 67bd9aebbcca99023ecb575e4f45e18039b5a899..40903c6c3444f5aa3aa88e2247ad3c9c643c2cbb 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -3,16 +3,17 @@
import csv
import os
-import random
from datetime import datetime
+from typing import Optional
import flashinfer
import torch
-FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
+from vllm.utils import round_up
-# KV Cache Layout for TRT-LLM
-# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim)
+FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
+FP8_DTYPE = torch.float8_e4m3fn
+FP4_DTYPE = torch.uint8
def to_float8(x, dtype=torch.float8_e4m3fn):
@@ -26,84 +27,100 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
@torch.no_grad()
def benchmark_prefill(
- num_seqs,
- max_seq_len,
- page_size=16,
- dtype=torch.bfloat16,
- kv_layout="HND",
- num_kv_heads=8,
- kv_cache_dtype="auto",
- head_dim=128,
- warmup=10,
- trials=20,
+ dtype: torch.dtype,
+ quant_dtypes: tuple[
+ Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
+ ],
+ batch_size: int,
+ max_seq_len: int,
+ num_heads: tuple[int, int] = (64, 8),
+ head_size: int = 128,
+ kv_layout: str = "HND",
+ block_size: int = 16,
+ warmup: int = 10,
+ trials: int = 20,
):
torch.set_default_device("cuda")
torch.manual_seed(0)
- HEAD_GRP_SIZE = 8
- MAX_SEQ_LEN = max_seq_len
+ q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
+ q_quant_dtype = q_quant_dtype or dtype
+ kv_quant_dtype = kv_quant_dtype or dtype
+ o_quant_dtype = o_quant_dtype or dtype
- # large number to reduce kv_cache reuse
- NUM_BLOCKS = int(256000 / page_size)
+ max_q_len = max_kv_len = max_seq_len
- workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8)
+ num_qo_heads, num_kv_heads = num_heads
+ assert num_qo_heads % num_kv_heads == 0
- num_qo_heads = num_kv_heads * HEAD_GRP_SIZE
- sm_scale = float(1.0 / (head_dim**0.5))
+ sm_scale = float(1.0 / (head_size**0.5))
- q_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
- q_lens[-1] = MAX_SEQ_LEN
- max_q_len = max(q_lens)
+ # large number to reduce kv_cache reuse
+ NUM_BLOCKS = int(256000 / block_size)
+
+ kv_cache_shape = None
+ if kv_layout == "NHD":
+ kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size)
+ elif kv_layout == "HND":
+ kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size)
+ else:
+ raise ValueError(f"Invalid kv_layout: {kv_layout}")
+
+ q_lens = torch.randint(1, max_q_len, (batch_size,), dtype=torch.int32)
+ q_lens[-1] = max_q_len
q_indptr = torch.cat(
[
torch.tensor([0], dtype=torch.int32),
- torch.cumsum(
- torch.tensor(q_lens, dtype=torch.int32), dim=0, dtype=torch.int32
- ),
+ torch.cumsum(q_lens, dim=0, dtype=torch.int32),
]
)
- q = torch.randn(sum(q_lens), num_qo_heads, head_dim, dtype=dtype)
-
- kv_lens = [random.randint(0, MAX_SEQ_LEN) for _ in range(num_seqs)]
- kv_lens[-1] = MAX_SEQ_LEN
-
- seq_lens = [q_len + kv_len for q_len, kv_len in zip(q_lens, kv_lens)]
- max_seq_len = max(seq_lens)
- seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32)
- max_num_blocks_per_seq = (max_seq_len + page_size - 1) // page_size
- block_tables = torch.randint(
- 0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+ # Always using 1.0 scale to reflect the real perf in benchmarking
+ q_scale = 1.0
+ ref_query = torch.randn(
+ torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype
)
+ if q_quant_dtype == FP8_DTYPE:
+ query, _ = to_float8(ref_query)
+ else:
+ query = ref_query
- kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim)
- kv_cache = torch.randn(size=kv_cache_shape, dtype=dtype)
- k_scale = v_scale = 1.0
+ kv_lens = torch.randint(0, max_kv_len, (batch_size,), dtype=torch.int32)
+ kv_lens[-1] = max_kv_len
- if kv_cache_dtype.startswith("fp8"):
- kv_cache, _ = to_float8(kv_cache)
+ seq_lens = kv_lens + q_lens
+ max_seq_len = torch.max(seq_lens).item()
- output_trtllm = torch.empty(q.shape, dtype=dtype)
+ # Always using 1.0 scale to reflect the real perf in benchmarking
+ k_scale = v_scale = 1.0
+ ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
+ if kv_quant_dtype == FP8_DTYPE:
+ kv_cache, _ = to_float8(ref_kv_cache)
+ else:
+ kv_cache = ref_kv_cache
+ max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+ block_tables = torch.randint(
+ 0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
+ )
kv_indptr = [0]
kv_indices = []
kv_last_page_lens = []
- for i in range(num_seqs):
+ for i in range(batch_size):
seq_len = seq_lens[i]
assert seq_len > 0
- num_blocks = (seq_len + page_size - 1) // page_size
+ num_blocks = (seq_len + block_size - 1) // block_size
kv_indices.extend(block_tables[i, :num_blocks])
kv_indptr.append(kv_indptr[-1] + num_blocks)
- kv_last_page_len = seq_len % page_size
+ kv_last_page_len = seq_len % block_size
if kv_last_page_len == 0:
- kv_last_page_len = page_size
+ kv_last_page_len = block_size
kv_last_page_lens.append(kv_last_page_len)
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
-
- output_baseline = torch.empty(q.shape, dtype=dtype)
+ workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8)
wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
workspace_buffer, kv_layout
@@ -115,12 +132,12 @@ def benchmark_prefill(
kv_last_page_lens,
num_qo_heads,
num_kv_heads,
- head_dim,
- page_size,
+ head_size,
+ block_size,
causal=True,
sm_scale=sm_scale,
q_data_type=dtype,
- kv_data_type=kv_cache.dtype,
+ kv_data_type=dtype,
)
def time_fn(fn, warmup=10, trials=20):
@@ -138,52 +155,76 @@ def benchmark_prefill(
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))
+ o_scale = 1.0
+ o_sf_scale = None
+ output_baseline = torch.empty(ref_query.shape, dtype=dtype)
+ if o_quant_dtype == FP4_DTYPE:
+ o_sf_scale = 500.0
+ output_trtllm = flashinfer.utils.FP4Tensor(
+ torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
+ torch.empty(
+ (
+ round_up(query.shape[0], 128),
+ round_up(query.shape[1] * query.shape[2] // 16, 4),
+ ),
+ dtype=torch.float8_e4m3fn,
+ ),
+ )
+ else:
+ output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
+
def baseline_prefill():
return wrapper.run(
- q, kv_cache, k_scale=k_scale, v_scale=v_scale, out=output_baseline
+ ref_query,
+ ref_kv_cache,
+ k_scale=k_scale,
+ v_scale=v_scale,
+ out=output_baseline,
)
- def trt_prefill():
+ def trtllm_prefill():
return flashinfer.prefill.trtllm_batch_context_with_kv_cache(
- query=q,
+ query=query,
kv_cache=kv_cache,
workspace_buffer=workspace_buffer,
block_tables=block_tables,
- seq_lens=seq_lens_tensor,
+ seq_lens=seq_lens,
max_q_len=max_q_len,
max_kv_len=max_seq_len,
- bmm1_scale=k_scale * sm_scale,
- bmm2_scale=v_scale,
- batch_size=num_seqs,
+ bmm1_scale=q_scale * k_scale * sm_scale,
+ bmm2_scale=v_scale / o_scale,
+ batch_size=batch_size,
cum_seq_lens_q=q_indptr,
cum_seq_lens_kv=kv_indptr,
+ o_sf_scale=o_sf_scale,
out=output_trtllm,
)
- trt_mean, trt_std = time_fn(trt_prefill)
baseline_mean, baseline_std = time_fn(baseline_prefill)
+ trtllm_mean, trtllm_std = time_fn(trtllm_prefill)
# Calculate percentage speedup (positive means TRT is faster)
- speedup_percent = (baseline_mean - trt_mean) / baseline_mean
+ speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean
print(
- f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.5f}\t{trt_std.item():.5f}"
- f"\t{baseline_mean:.5f}\t{baseline_std.item():.5f}\t{speedup_percent:.5f}"
+ f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:8.3f}\t{trtllm_std.item():8.3f}"
+ f"\t{baseline_mean:8.3f}\t{baseline_std.item():8.3f}\t{speedup_percent:8.3f}"
)
# Return results for CSV writing
return {
- "num_seqs": num_seqs,
- "trt_mean": trt_mean,
- "trt_std": trt_std.item(),
+ "batch_size": batch_size,
+ "trtllm_mean": trtllm_mean,
+ "trtllm_std": trtllm_std.item(),
"baseline_mean": baseline_mean,
"baseline_std": baseline_std.item(),
"speedup_percent": speedup_percent,
- "q_dtype": str(dtype),
- "kv_cache_dtype": kv_cache_dtype,
- "page_size": page_size,
+ "q_dtype": str(q_quant_dtype),
+ "kv_cache_dtype": str(kv_quant_dtype),
+ "output_dtype": str(o_quant_dtype),
+ "block_size": block_size,
"num_kv_heads": num_kv_heads,
- "head_dim": head_dim,
+ "head_size": head_size,
"max_seq_len": max_seq_len,
}
@@ -195,17 +236,18 @@ def write_results_to_csv(results, filename=None):
filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"
fieldnames = [
- "num_seqs",
- "trt_mean",
- "trt_std",
+ "batch_size",
+ "trtllm_mean",
+ "trtllm_std",
"baseline_mean",
"baseline_std",
"speedup_percent",
"q_dtype",
"kv_cache_dtype",
- "page_size",
+ "output_dtype",
+ "block_size",
"num_kv_heads",
- "head_dim",
+ "head_size",
"max_seq_len",
]
@@ -224,27 +266,42 @@ def write_results_to_csv(results, filename=None):
if __name__ == "__main__":
- num_seqs = [1, 4, 8, 16, 32, 64, 128, 256]
+ batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256]
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
all_results = []
- print(
- "Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
- "output_dtype: bfloat16"
- )
- print(
- "\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
- "baseline_std\tspeedup_percent"
- )
- for max_seq_len in max_seq_lens:
- for bs in num_seqs:
- result = benchmark_prefill(
- bs,
- max_seq_len,
- dtype=torch.bfloat16,
- kv_cache_dtype="auto",
- )
- all_results.append(result)
+ dtype = torch.bfloat16
+ quant_dtypes = [
+ # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
+ (None, None, None),
+ (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
+ (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
+ ]
+
+ for quant_dtype in quant_dtypes:
+ q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype
+ q_quant_dtype = q_quant_dtype or dtype
+ kv_quant_dtype = kv_quant_dtype or dtype
+ o_quant_dtype = o_quant_dtype or dtype
+
+ print(
+ f"Running benchmark for q_dtype = {q_quant_dtype}, "
+ f"kv_cache_dtype: {kv_quant_dtype}, "
+ f"output_dtype: {o_quant_dtype}"
+ )
+ print(
+ "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t"
+ "baseline_std\tspeedup_percent"
+ )
+ for max_seq_len in max_seq_lens:
+ for bs in batch_sizes:
+ result = benchmark_prefill(
+ dtype=dtype,
+ quant_dtypes=quant_dtype,
+ batch_size=bs,
+ max_seq_len=max_seq_len,
+ )
+ all_results.append(result)
# Write all results to CSV
write_results_to_csv(all_results)
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index 4fcdbadd65ecd366f51ce5d9053cc700854dcb71..98bde9d83c82d199d3eb3fb609b10fff28b99525 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -11,8 +11,8 @@ from datetime import datetime
from typing import Any
import torch
-import tqdm
import triton
+from tqdm import tqdm
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul,
@@ -141,6 +141,7 @@ def get_weight_shapes(tp_size):
# cannot TP
total = [
(512 + 64, 7168),
+ (2112, 7168),
((128 + 64) * 128, 7168),
(128 * (128 + 128), 512),
(7168, 16384),
diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py
index a27f02394afbdfa4f3a352a599db349a2601bea5..9a057990bda5f64deada11b0beb56c0207570de5 100644
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -95,4 +95,10 @@ WEIGHT_SHAPES = {
([2048, 2816], 1),
([1408, 2048], 0),
],
+ "CohereLabs/c4ai-command-a-03-2025": [
+ ([12288, 14336], 1),
+ ([12288, 12288], 0),
+ ([12288, 73728], 1),
+ ([36864, 12288], 0),
+ ],
}
diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md
index ae0866ae607511023596b0d192d4e311ae95d42e..7adf97bcf56228916918fa54786a03e28b4abdbe 100644
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -5,11 +5,13 @@ The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `re
First start serving your model
```bash
-export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
+export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
-vllm serve $MODEL_NAME --disable-log-requests
+vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests
```
+The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface).
+
## Synthetic Multi-Turn Conversations
Download the following text file (used for generation of synthetic conversations)
@@ -26,10 +28,10 @@ But you may use other text files if you prefer (using this specific file is not
Then run the benchmarking script
```bash
-export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
+export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
-python benchmark_serving_multi_turn.py --model $MODEL_NAME --input-file generate_multi_turn.json \
---num-clients 2 --max-active-conversations 6
+python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
+--input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6
```
You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 53c3207491d188ddbd1f2bafd7b0f554aefcaa05..d23b7b6e4571dff6263aeb508ba5c8e4604ed786 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -825,9 +825,11 @@ def get_client_config(
# Arguments for API requests
chat_url = f"{args.url}/v1/chat/completions"
+ model_name = args.served_model_name if args.served_model_name else args.model
+
req_args = RequestArgs(
chat_url=chat_url,
- model=args.model,
+ model=model_name,
stream=not args.no_stream,
limit_min_tokens=args.limit_min_tokens,
limit_max_tokens=args.limit_max_tokens,
@@ -1247,9 +1249,19 @@ async def main() -> None:
default=0,
help="Seed for random number generators (default: 0)",
)
+
parser.add_argument(
"-m", "--model", type=str, required=True, help="Path of the LLM model"
)
+ parser.add_argument(
+ "--served-model-name",
+ type=str,
+ default=None,
+ help="The model name used in the API. "
+ "If not specified, the model name will be the "
+ "same as the ``--model`` argument. ",
+ )
+
parser.add_argument(
"-u",
"--url",
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index e0da46e2accaaea3dff79fd7e1acf57bd2b0cc84..52bfd82c7fcfed310bd925ecdbd59ba1f1140f40 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -1,6 +1,7 @@
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -182,17 +183,17 @@ endif()
#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
# Flag to enable ACL kernels for AARCH64 platforms
-if ( VLLM_BUILD_ACL STREQUAL "ON")
+if (VLLM_BUILD_ACL STREQUAL "ON")
set(USE_ACL ON)
else()
set(USE_ACL OFF)
endif()
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
- GIT_TAG v3.8.1
+ GIT_TAG v3.9
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
@@ -204,7 +205,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
- endif()
+ endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
@@ -217,38 +218,23 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
+ set(ONEDNN_VERBOSE "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
FetchContent_MakeAvailable(oneDNN)
-
- list(APPEND LIBS dnnl)
-elseif(POWER10_FOUND)
- FetchContent_Declare(
- oneDNN
- GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
- GIT_TAG v3.7.2
- GIT_PROGRESS TRUE
- GIT_SHALLOW TRUE
+ add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
+ target_include_directories(
+ dnnl_ext
+ PUBLIC ${oneDNN_SOURCE_DIR}/include
+ PUBLIC ${oneDNN_BINARY_DIR}/include
+ PRIVATE ${oneDNN_SOURCE_DIR}/src
)
-
- set(ONEDNN_LIBRARY_TYPE "STATIC")
- set(ONEDNN_BUILD_DOC "OFF")
- set(ONEDNN_BUILD_EXAMPLES "OFF")
- set(ONEDNN_BUILD_TESTS "OFF")
- set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
- set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
- set(ONEDNN_BUILD_GRAPH "OFF")
- set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
- set(ONEDNN_ENABLE_ITT_TASKS "OFF")
- set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
- set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
- set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
-
- set(DNNL_CPU_RUNTIME "OMP")
-
- FetchContent_MakeAvailable(oneDNN)
-
- list(APPEND LIBS dnnl)
+ target_link_libraries(dnnl_ext dnnl)
+ target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
+ list(APPEND LIBS dnnl_ext)
+ set(USE_ONEDNN ON)
+else()
+ set(USE_ONEDNN OFF)
endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@@ -275,7 +261,6 @@ set(VLLM_EXT_SRC
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
- "csrc/cpu/quant.cpp"
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC})
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
@@ -289,14 +274,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
${VLLM_EXT_SRC})
add_compile_definitions(-DCPU_CAPABILITY_AVX512)
endif()
-elseif(POWER10_FOUND)
- set(VLLM_EXT_SRC
- "csrc/cpu/quant.cpp"
- ${VLLM_EXT_SRC})
endif()
-if (ASIMD_FOUND)
+
+if(USE_ONEDNN)
set(VLLM_EXT_SRC
- "csrc/cpu/quant.cpp"
+ "csrc/cpu/dnnl_kernels.cpp"
${VLLM_EXT_SRC})
endif()
diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake
index ee6768bce26ca48714361176e502a2d3f8829669..02224cfe3ee81289385f335791794186ac33ff44 100644
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
- GIT_TAG 0e43e774597682284358ff2c54530757b654b8d1
+ GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@@ -37,13 +37,14 @@ cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
- ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
+ ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
- ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu)
+ ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
+ ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc/cutlass/include
- ${flashmla_SOURCE_DIR}/csrc/include)
+ ${flashmla_SOURCE_DIR}/csrc)
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}"
diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
index e0e95d06290dfd6e3df041f9b3d3ac8cb4220db3..6dd6f269f3dc955558914fe9387c0cbe30d22f32 100644
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@@ -167,7 +167,7 @@ typename T::Fmha::Arguments args_from_options(
// TODO(trevor-m): Change split_kv back to -1 when
// https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
// perform worse with larger context length and smaller batch sizes.
- num_kv_splits, // split_kv
+ static_cast(num_kv_splits), // split_kv
nullptr, // is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
@@ -264,7 +264,7 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba
// Assumes device 0 when getting sm_count.
arguments.hw_info.sm_count =
sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count;
- arguments.split_kv = num_kv_splits;
+ arguments.split_kv = static_cast(num_kv_splits);
MlaSm100Type::Fmha::set_split_kv(arguments);
return MlaSm100Type::Fmha::get_workspace_size(arguments);
diff --git a/csrc/cache.h b/csrc/cache.h
index 02049d3582f0761bf3268441d54978473d8f0151..31ec03ca49f4b4d57752b6e37ab3dffb6e615040 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -42,10 +42,35 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
const std::string& kv_cache_dtype,
torch::Tensor& scale);
+void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
+ torch::Tensor& cp_local_token_select_indices,
+ torch::Tensor& kv_cache,
+ torch::Tensor& slot_mapping,
+ const std::string& kv_cache_dtype,
+ torch::Tensor& scale);
+
// Just for unittest
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
+
+void gather_and_maybe_dequant_cache(
+ torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+ torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
+ torch::Tensor const& cu_seq_lens, // [BATCH+1]
+ int64_t batch_size, const std::string& kv_cache_dtype,
+ torch::Tensor const& scale,
+ std::optional seq_starts = std::nullopt);
+
+// TODO(hc): cp_gather_cache need support scaled kvcahe in the future.
+void cp_gather_cache(
+ torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+ torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
+ torch::Tensor const& cu_seq_lens, // [BATCH+1]
+ int64_t batch_size, std::optional seq_starts = std::nullopt);
+
void read_cache(
torch::Tensor& keys,
torch::Tensor& values,
@@ -61,10 +86,3 @@ void write_cache_multi_layers(
std::vector const& value_caches,
torch::Tensor& slot_mapping,
const std::string& kv_cache_dtype);
-
-void gather_cache(
- torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
- torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
- torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
- torch::Tensor const& cu_seq_lens, // [BATCH+1]
- int64_t batch_size, std::optional seq_starts = std::nullopt);
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 3132affeb4b5e2db1fdc36efa8cdc1795c87dc4d..89ac4761be44b177604cd1e8b2834fb8965d0e99 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -1,6 +1,7 @@
#include
#include
#include
+#include
#include "cuda_utils.h"
#include "cuda_compat.h"
@@ -592,6 +593,51 @@ __global__ void concat_and_cache_mla_kernel(
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
+template
+__global__ void cp_fused_concat_and_cache_mla_kernel(
+ const scalar_t* __restrict__ kv_c, // [num_full_tokens, kv_lora_rank]
+ const scalar_t* __restrict__ k_pe, // [num_full_tokens, pe_dim]
+ const int64_t* __restrict__ cp_local_token_select_indices, // [num_tokens]
+ cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
+ // + pe_dim)]
+ const int64_t* __restrict__ slot_mapping, // [num_tokens]
+ const int block_stride, //
+ const int entry_stride, //
+ const int kv_c_stride, //
+ const int k_pe_stride, //
+ const int kv_lora_rank, //
+ const int pe_dim, //
+ const int block_size, //
+ const float* scale //
+) {
+ const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
+ const int64_t slot_idx = slot_mapping[blockIdx.x];
+ // NOTE: slot_idx can be -1 if the token is padded
+ if (slot_idx < 0) {
+ return;
+ }
+ const int64_t block_idx = slot_idx / block_size;
+ const int64_t block_offset = slot_idx % block_size;
+
+ auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
+ int src_stride, int dst_stride, int size, int offset) {
+ for (int i = threadIdx.x; i < size; i += blockDim.x) {
+ const int64_t src_idx = token_idx * src_stride + i;
+ const int64_t dst_idx =
+ block_idx * block_stride + block_offset * entry_stride + i + offset;
+ if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+ dst[dst_idx] = src[src_idx];
+ } else {
+ dst[dst_idx] =
+ fp8::scaled_convert(src[src_idx], *scale);
+ }
+ }
+ };
+
+ copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
+ copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
+}
+
} // namespace vllm
// KV_T is the data type of key and value tensors.
@@ -896,6 +942,20 @@ void write_cache_multi_layers(
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast(scale.data_ptr()));
+// KV_T is the data type of key and value tensors.
+// CACHE_T is the stored data type of kv-cache.
+// KV_DTYPE is the real data type of kv-cache.
+#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
+ vllm::cp_fused_concat_and_cache_mla_kernel \
+ <<>>( \
+ reinterpret_cast(kv_c.data_ptr()), \
+ reinterpret_cast(k_pe.data_ptr()), \
+ cp_local_token_select_indices.data_ptr(), \
+ reinterpret_cast(kv_cache.data_ptr()), \
+ slot_mapping.data_ptr(), block_stride, entry_stride, \
+ kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
+ reinterpret_cast(scale.data_ptr()));
+
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
@@ -934,6 +994,50 @@ void concat_and_cache_mla(
CALL_CONCAT_AND_CACHE_MLA);
}
+// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
+// calls into one:
+// k_c_normed.index_select(0, cp_local_token_select_indices) + \
+// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
+// concat_and_cache_mla.
+void cp_fused_concat_and_cache_mla(
+ torch::Tensor& kv_c, // [num_total_tokens, kv_lora_rank]
+ torch::Tensor& k_pe, // [num_total_tokens, pe_dim]
+ torch::Tensor& cp_local_token_select_indices, // [num_tokens]
+ torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
+ // pe_dim)]
+ torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
+ const std::string& kv_cache_dtype, torch::Tensor& scale) {
+ // NOTE(woosuk): In vLLM V1, key.size(0) can be different from
+ // slot_mapping.size(0) because of padding for CUDA graphs.
+ // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
+ // both include padding.
+ // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
+ // since key includes padding for CUDA graphs, while slot_mapping does not.
+ // In this case, slot_mapping.size(0) represents the actual number of tokens
+ // before padding.
+ // For compatibility with both cases, we use slot_mapping.size(0) as the
+ // number of tokens.
+ int num_tokens = slot_mapping.size(0);
+ int kv_lora_rank = kv_c.size(1);
+ int pe_dim = k_pe.size(1);
+ int block_size = kv_cache.size(1);
+
+ TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
+
+ int kv_c_stride = kv_c.stride(0);
+ int k_pe_stride = k_pe.stride(0);
+ int block_stride = kv_cache.stride(0);
+ int entry_stride = kv_cache.stride(1);
+
+ dim3 grid(num_tokens);
+ dim3 block(std::min(kv_lora_rank, 512));
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+ CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
+}
+
namespace vllm {
template
@@ -1012,9 +1116,9 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
namespace vllm {
// grid is launched with dimensions (batch, num_splits)
-template
-__global__ void gather_cache(
- const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
+template
+__global__ void gather_and_maybe_dequant_cache(
+ const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
@@ -1022,6 +1126,7 @@ __global__ void gather_cache(
const int32_t block_size, const int32_t entry_size,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
+ const float* __restrict__ scale,
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
// batch
@@ -1063,10 +1168,16 @@ __global__ void gather_cache(
if (partial_block_size) full_blocks_end -= 1;
}
- auto copy_entry = [&](const scalar_t* __restrict__ _src,
+ auto copy_entry = [&](const cache_t* __restrict__ _src,
scalar_t* __restrict__ _dst) {
- for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
- _dst[i] = _src[i];
+ for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
+ if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+ _dst[i] = static_cast(_src[i]);
+ } else {
+ _dst[i] =
+ fp8::scaled_convert(_src[i], *scale);
+ }
+ }
};
for (int pid = split_start; pid < full_blocks_end; ++pid) {
@@ -1093,8 +1204,144 @@ __global__ void gather_cache(
} // namespace vllm
// Macro to dispatch the kernel based on the data type.
-#define CALL_GATHER_CACHE(CPY_DTYPE) \
- vllm::gather_cache<<>>( \
+// SCALAR_T is the data type of the destination tensor.
+// CACHE_T is the stored data type of kv-cache.
+// KV_DTYPE is the real data type of kv-cache.
+#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
+ vllm::gather_and_maybe_dequant_cache \
+ <<>>( \
+ reinterpret_cast(src_cache.data_ptr()), \
+ reinterpret_cast(dst.data_ptr()), \
+ block_table.data_ptr(), cu_seq_lens.data_ptr(), \
+ block_size, entry_size, block_table_stride, cache_block_stride, \
+ cache_entry_stride, dst_entry_stride, \
+ reinterpret_cast(scale.data_ptr()), seq_starts_ptr);
+
+// Gather sequences from the cache into the destination tensor.
+// - cu_seq_lens contains the cumulative sequence lengths for each batch
+// - block_table contains the cache block indices for each sequence
+// - Optionally, seq_starts (if provided) offsets the starting block index by
+// (seq_starts[bid] / page_size)
+void gather_and_maybe_dequant_cache(
+ torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+ torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
+ torch::Tensor const& cu_seq_lens, // [BATCH+1]
+ int64_t batch_size, const std::string& kv_cache_dtype,
+ torch::Tensor const& scale,
+ std::optional seq_starts = std::nullopt) {
+ at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ int32_t block_size = src_cache.size(1);
+ int32_t entry_size = src_cache.flatten(2, -1).size(2);
+
+ TORCH_CHECK(block_table.dtype() == torch::kInt32,
+ "block_table must be int32");
+ TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
+ "cu_seq_lens must be int32");
+ if (seq_starts.has_value()) {
+ TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
+ "seq_starts must be int32");
+ }
+
+ TORCH_CHECK(src_cache.device() == dst.device(),
+ "src_cache and dst must be on the same device");
+ TORCH_CHECK(src_cache.device() == block_table.device(),
+ "src_cache and block_table must be on the same device");
+ TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
+ "src_cache and cu_seq_lens must be on the same device");
+ if (seq_starts.has_value()) {
+ TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
+ "src_cache and seq_starts must be on the same device");
+ }
+
+ int64_t block_table_stride = block_table.stride(0);
+ int64_t cache_block_stride = src_cache.stride(0);
+ int64_t cache_entry_stride = src_cache.stride(1);
+ int64_t dst_entry_stride = dst.stride(0);
+
+ // Decide on the number of splits based on the batch size.
+ int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
+ dim3 grid(batch_size, num_splits);
+ dim3 block(1024);
+
+ const int32_t* seq_starts_ptr =
+ seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr;
+
+ DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
+}
+
+namespace vllm {
+template
+// Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
+// block_size.
+__global__ void cp_gather_cache(
+ const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
+ // ENTRY_SIZE]
+ scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRY_SIZE]
+ const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
+ const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
+ const int32_t block_size, const int32_t entry_size,
+ const int64_t block_table_stride, const int64_t cache_block_stride,
+ const int64_t cache_entry_stride, const int64_t dst_entry_stride,
+ const int32_t* __restrict__ seq_starts // Optional: starting offsets per
+ // batch
+) {
+ const int64_t bid = blockIdx.x; // Batch ID
+ const int32_t num_splits = gridDim.y;
+ const int32_t split = blockIdx.y;
+ const int32_t seq_start = cu_seq_lens[bid];
+ const int32_t seq_end = cu_seq_lens[bid + 1];
+ const int32_t seq_len = seq_end - seq_start;
+ const int32_t tot_slots = seq_len;
+ const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);
+
+ const int32_t split_start = split * split_slots;
+ const int32_t split_end = min((split + 1) * split_slots, tot_slots);
+
+ const bool is_active_split = (split_start < tot_slots);
+
+ if (!is_active_split) return;
+
+ // Adjust the pointer for the block_table for this batch.
+ // If seq_starts is provided, compute an offset based on it
+ const int32_t batch_offset = bid * block_table_stride;
+ int32_t offset = split_start;
+ if (seq_starts != nullptr) {
+ offset += seq_starts[bid];
+ }
+ int32_t offset_div = offset / block_size;
+ offset = offset % block_size;
+ const int32_t* batch_block_table = block_table + batch_offset;
+
+ // Adjust dst pointer based on the cumulative sequence lengths.
+ dst += seq_start * dst_entry_stride;
+
+ auto copy_entry = [&](const scalar_t* __restrict__ _src,
+ scalar_t* __restrict__ _dst) {
+ for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
+ _dst[i] = _src[i];
+ };
+
+ for (int pid = split_start; pid < split_end; ++pid) {
+ auto block_id = batch_block_table[offset_div];
+ auto block_start_ptr = src_cache + block_id * cache_block_stride;
+ auto block_dst_ptr = dst + pid * dst_entry_stride;
+ copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr);
+ offset += 1;
+ // bump to next block
+ if (offset == block_size) {
+ offset_div += 1;
+ offset = 0;
+ }
+ }
+}
+} // namespace vllm
+
+// Macro to dispatch the kernel based on the data type.
+#define CALL_CP_GATHER_CACHE(CPY_DTYPE) \
+ vllm::cp_gather_cache<<>>( \
reinterpret_cast(src_cache.data_ptr()), \
reinterpret_cast(dst.data_ptr()), \
block_table.data_ptr(), cu_seq_lens.data_ptr(), \
@@ -1104,9 +1351,9 @@ __global__ void gather_cache(
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
-// - Optionally, seq_starts (if provided) offsets the starting block index by
-// (seq_starts[bid] / page_size)
-void gather_cache(
+// - Optionally, seq_starts (if provided) offsets the starting slot index by
+// seq_starts[bid]
+void cp_gather_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
@@ -1157,11 +1404,11 @@ void gather_cache(
seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr;
if (dtype_bits == 32) {
- CALL_GATHER_CACHE(uint32_t);
+ CALL_CP_GATHER_CACHE(uint32_t);
} else if (dtype_bits == 16) {
- CALL_GATHER_CACHE(uint16_t);
+ CALL_CP_GATHER_CACHE(uint16_t);
} else if (dtype_bits == 8) {
- CALL_GATHER_CACHE(uint8_t);
+ CALL_CP_GATHER_CACHE(uint8_t);
} else {
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
}
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index 3952c43cbc727de4dcdb2de2fa447d837742d123..982f7c07a13bd33a1deab10786e58cae527d5c0d 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -89,7 +89,7 @@ struct FP16Vec16 : public Vec {
explicit FP16Vec16(const FP32Vec16&);
- void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
+ void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
void save(void* ptr, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
@@ -126,7 +126,7 @@ struct BF16Vec16 : public Vec {
explicit BF16Vec16(const FP32Vec16&);
- void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
+ void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
void save(void* ptr, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
@@ -180,8 +180,8 @@ struct BF16Vec32 : public Vec {
(__m128i)vec8_data.reg, 1)) {}
void save(void* ptr) const {
- *reinterpret_cast<__m256i*>(ptr) = reg_low;
- *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high;
+ _mm256_storeu_si256((__m256i*)ptr, reg_low);
+ _mm256_storeu_si256((__m256i*)ptr + 1, reg_high);
}
};
#endif
diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3f00edb36068aee040927c49c4cf3087cead3aa
--- /dev/null
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -0,0 +1,346 @@
+#include
+#include
+
+#include "common/memory_desc.hpp"
+#include "common/memory.hpp"
+
+#include "dnnl_helper.h"
+
+static dnnl::engine& default_engine() {
+ static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
+ return engine;
+}
+
+static dnnl::stream& default_stream() {
+ static dnnl::stream stream(default_engine());
+ return stream;
+}
+
+void release_dnnl_matmul_handler(int64_t handler) {
+ DNNLMatMulPrimitiveHandler* ptr =
+ reinterpret_cast(handler);
+ delete ptr;
+}
+
+template
+class DNNLPrimitiveCache {
+ public:
+ using cache_value_t = std::pair;
+ using result_value_t = VT;
+ using container_t = std::list;
+ using value_iterator_t = typename container_t::iterator;
+ using map_t = std::unordered_map;
+ using creator_t = VT (*)();
+
+ public:
+ DNNLPrimitiveCache(size_t capacity)
+ : capacity_(capacity),
+ values_(),
+ key_to_value_(std::min(256lu, capacity)) {
+ assert(capacity > 0);
+ }
+
+ template
+ result_value_t get_or_create(const KT& key, F&& creator) {
+ std::optional value = get_value(key);
+ if (value.has_value()) {
+ return value.value()->second;
+ } else {
+ return add_value({key, creator()})->second;
+ }
+ }
+
+ size_t size() const { return values_.size(); }
+
+ private:
+ void dump_data() {
+ std::stringstream ss;
+ ss << "table_id: " << std::hex << reinterpret_cast(this) << std::dec
+ << "\n";
+ ss << "container: [";
+ for (auto&& iter : values_) {
+ ss << "(" << iter.first << ", " << std::hex
+ << reinterpret_cast(iter.second.get()) << "), " << std::dec;
+ }
+ ss << "]\n";
+
+ ss << "map: [";
+ for (auto&& iter : key_to_value_) {
+ ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex
+ << reinterpret_cast(iter.second->second.get()) << std::dec
+ << "), ";
+ }
+ ss << "]\n";
+ std::printf("%s\n", ss.str().c_str());
+ }
+
+ value_iterator_t add_value(cache_value_t&& new_value) {
+ if (size() == capacity_) {
+ cache_value_t& last_item = values_.back();
+ key_to_value_.erase(last_item.first);
+ values_.pop_back();
+ }
+
+ auto& added_value_ = values_.emplace_front(std::move(new_value));
+ key_to_value_.emplace(added_value_.first, values_.begin());
+ return values_.begin();
+ }
+
+ std::optional get_value(const KT& key) {
+ if (key_to_value_.size() > 0 && key == values_.begin()->first) {
+ return values_.begin();
+ }
+
+ auto value_map_iterator = key_to_value_.find(key);
+ if (value_map_iterator != key_to_value_.end()) {
+ values_.splice(values_.begin(), values_, value_map_iterator->second);
+ return value_map_iterator->second;
+ } else {
+ return {};
+ }
+ }
+
+ private:
+ const size_t capacity_;
+ container_t values_;
+ map_t key_to_value_;
+};
+
+DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
+ const Args& args, dnnl::memory::data_type b_type)
+ : b_n_size_(args.b_n_size),
+ b_n_stride_(args.b_n_stride),
+ b_k_size_(args.b_k_size),
+ b_k_stride_(args.b_k_stride),
+ b_type_(b_type),
+ c_type_(args.c_type),
+ runtime_memory_ptrs_(8),
+ primitive_cache_size_(args.primitive_cache_size) {
+ assert(primitive_cache_size_ > 0);
+}
+
+void DNNLMatMulPrimitiveHandler::prepack_weight(
+ void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
+ dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
+ {b_k_stride_, b_n_stride_});
+ dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
+ dnnl::memory packed_weight(b_target_mem_desc, default_engine());
+ {
+ dnnl::reorder(original_weight, packed_weight)
+ .execute(default_stream(), original_weight, packed_weight);
+ default_stream().wait();
+ }
+ memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight;
+ b_target_mem_desc_ = b_target_mem_desc;
+}
+
+void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr(
+ size_t index, dnnl_memory* memory_ptr) {
+ dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage();
+ dnnl_memory_desc* mem_desc = const_cast(memory_ptr->md());
+ runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc};
+}
+
+std::pair
+DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) {
+ return runtime_memory_ptrs_[index];
+}
+
+namespace std {
+template <>
+struct hash {
+ size_t operator()(
+ const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
+ return hash()(val.b_n_size) ^ hash()(val.b_k_size) ^
+ hash()(static_cast(val.a_qs)) ^
+ hash()(static_cast(val.b_qs)) ^ hash()(val.use_azp) ^
+ hash()(static_cast(val.c_type));
+ }
+};
+
+template <>
+struct hash {
+ size_t operator()(
+ const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const {
+ return hash()(val.a_m_size) ^ hash()(val.use_bias) ^
+ hash()(static_cast(val.bias_type));
+ }
+};
+} // namespace std
+
+bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
+ const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
+ return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
+ l.a_qs == r.a_qs && l.b_qs == r.b_qs && l.use_azp == r.use_azp &&
+ l.c_type == r.c_type;
+}
+
+bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
+ const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) {
+ return l.use_bias == r.use_bias && l.a_m_size == r.a_m_size &&
+ l.bias_type == r.bias_type;
+}
+
+static std::shared_ptr
+get_w8a8_class_primitive_cache(
+ const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
+ int64_t cache_size) {
+ static W8A8MatMulPrimitiveHandler::ClassMatmulCache cache(128);
+ assert(cache_size > 0);
+ return cache.get_or_create(key, [&]() {
+ return std::make_shared(cache_size);
+ });
+}
+
+W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
+ : DNNLMatMulPrimitiveHandler(
+ static_cast(args),
+ dnnl::memory::data_type::s8),
+ use_azp_(args.use_a_zero_point),
+ a_qs_(args.a_quantization_strategy),
+ b_qs_(args.b_quantization_strategy),
+ m_size_cache_(nullptr) {
+ assert(a_qs_ != QuantizationStrategy::PER_OUTPUT_CHANNEL);
+ assert(b_qs_ != QuantizationStrategy::PER_TOKEN);
+ if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
+ assert(!use_azp_);
+ };
+ prepack_weight(args.b_ptr,
+ create_primitive_desc(
+ MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
+ .use_bias = false,
+ .bias_type = dnnl::memory::data_type::undef},
+ true)
+ .weights_desc());
+ init_runtime_memory_cache(args);
+}
+
+void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
+ auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
+ auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
+ a_storage->set_data_handle((void*)args.a_ptr);
+ a_mem_desc->dims[0] = args.a_m_size;
+ c_storage->set_data_handle((void*)args.c_ptr);
+ c_mem_desc->dims[0] = args.a_m_size;
+
+ if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
+ auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2);
+ a_scale_storage->set_data_handle((void*)args.a_scales_ptr);
+ }
+ if (use_azp_) {
+ auto&& [a_zero_point_storage, a_zero_point_mem_desc] =
+ get_runtime_memory_ptr(3);
+ a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr);
+ }
+
+ if (args.use_bias) {
+ auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4);
+ bias_storage->set_data_handle((void*)args.bias_ptr);
+ }
+
+ dnnl::matmul matmul = get_matmul_cache(args);
+ matmul.execute(default_stream(), memory_cache_);
+ default_stream().wait();
+}
+
+dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
+ const MSizeCacheKey& key) {
+ if (m_size_cache_.get() == nullptr) {
+ ClassMatmulCacheKey key = {.b_n_size = b_n_size_,
+ .b_k_size = b_k_size_,
+ .a_qs = a_qs_,
+ .b_qs = b_qs_,
+ .use_azp = use_azp_,
+ .c_type = c_type_};
+ m_size_cache_ = get_w8a8_class_primitive_cache(key, primitive_cache_size_);
+ }
+
+ return m_size_cache_->get_or_create(key, [&]() {
+ dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
+ return dnnl::matmul(desc);
+ });
+}
+
+void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
+ memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_},
+ dnnl::memory::data_type::s8,
+ dnnl::memory::format_tag::ab},
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
+ memory_cache_[DNNL_ARG_DST] =
+ dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
+
+ // For PER_TOKEN, scales will be applied in outside epilogue
+ if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
+ memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory(
+ {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr);
+ set_runtime_memory_ptr(
+ 2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get());
+ if (use_azp_) {
+ memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory(
+ {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr);
+ set_runtime_memory_ptr(
+ 3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get());
+ }
+ }
+
+ if (b_qs_ == QuantizationStrategy::PER_TENSOR) {
+ memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
+ dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(),
+ (void*)args.b_scales_ptr);
+ } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) {
+ memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
+ dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+ default_engine(), (void*)args.b_scales_ptr);
+ }
+
+ memory_cache_[DNNL_ARG_BIAS] =
+ dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
+ default_engine(), nullptr);
+ set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());
+}
+
+dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
+ const MSizeCacheKey& key, bool first_time) {
+ dnnl::memory::desc a_md({key.a_m_size, b_k_size_},
+ dnnl::memory::data_type::s8,
+ dnnl::memory::format_tag::ab);
+ dnnl::memory::desc b_md;
+ if (first_time) {
+ b_md =
+ dnnl::memory::desc({b_k_size_, b_n_size_}, dnnl::memory::data_type::s8,
+ dnnl::memory::format_tag::any);
+ } else {
+ b_md = b_target_mem_desc_;
+ }
+ dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_,
+ dnnl::memory::format_tag::ab);
+
+ dnnl::primitive_attr attr;
+ // For PER_TOKEN, scales will be applied in outside epilogue
+ if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
+ attr.set_scales_mask(DNNL_ARG_SRC, 0);
+ if (use_azp_) {
+ attr.set_zero_points_mask(DNNL_ARG_SRC, 0);
+ }
+ }
+
+ if (b_qs_ == QuantizationStrategy::PER_TENSOR) {
+ attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
+ } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) {
+ attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
+ }
+
+ if (key.use_bias) {
+ // For PER_TOKEN, bias will be applied in epilogue
+ assert(a_qs_ == QuantizationStrategy::PER_TENSOR);
+ dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
+ return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md,
+ c_md, attr);
+ } else {
+ return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md,
+ attr);
+ }
+}
diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..54ceefced9e985e4b3cd20a1df17e1cbc215d57e
--- /dev/null
+++ b/csrc/cpu/dnnl_helper.h
@@ -0,0 +1,169 @@
+#ifndef DNNL_HELPER_H
+#define DNNL_HELPER_H
+
+#include
+#include
+
+#include "oneapi/dnnl/dnnl.hpp"
+
+namespace c10 {
+struct BFloat16;
+struct Half;
+} // namespace c10
+
+namespace dnnl {
+namespace impl {
+struct memory_storage_t;
+struct matmul_pd_t;
+struct matmul_desc_t;
+} // namespace impl
+} // namespace dnnl
+struct dnnl_memory_desc;
+
+template
+class DNNLPrimitiveCache;
+
+template
+struct DNNLType {
+ static constexpr dnnl::memory::data_type type =
+ dnnl::memory::data_type::undef;
+};
+
+template <>
+struct DNNLType {
+ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
+};
+
+template <>
+struct DNNLType {
+ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
+};
+
+template <>
+struct DNNLType {
+ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
+};
+
+template <>
+struct DNNLType {
+ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
+};
+
+template <>
+struct DNNLType {
+ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
+};
+
+template
+constexpr inline dnnl::memory::data_type get_dnnl_type() {
+ return DNNLType>::type;
+}
+
+class DNNLMatMulPrimitiveHandler {
+ public:
+ virtual ~DNNLMatMulPrimitiveHandler() = default;
+
+ protected:
+ struct Args {
+ dnnl_dim_t b_n_size;
+ dnnl_dim_t b_n_stride;
+ dnnl_dim_t b_k_size;
+ dnnl_dim_t b_k_stride;
+ void* b_ptr;
+ dnnl::memory::data_type c_type;
+ size_t primitive_cache_size;
+ };
+
+ protected:
+ DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
+
+ void prepack_weight(void* original_b_ptr,
+ dnnl::memory::desc b_target_mem_desc);
+
+ void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
+
+ std::pair
+ get_runtime_memory_ptr(size_t index);
+
+ protected:
+ const dnnl_dim_t b_n_size_;
+ const dnnl_dim_t b_n_stride_;
+ const dnnl_dim_t b_k_size_;
+ const dnnl_dim_t b_k_stride_;
+ dnnl::memory::data_type b_type_;
+ dnnl::memory::data_type c_type_;
+ std::unordered_map memory_cache_;
+ std::vector>
+ runtime_memory_ptrs_;
+ dnnl::memory::desc b_target_mem_desc_;
+ int64_t primitive_cache_size_;
+};
+
+class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
+ public:
+ enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL };
+
+ struct Args : public DNNLMatMulPrimitiveHandler::Args {
+ bool use_a_zero_point;
+ QuantizationStrategy a_quantization_strategy;
+ QuantizationStrategy b_quantization_strategy;
+ float* b_scales_ptr;
+ };
+
+ struct ClassMatmulCacheKey {
+ dnnl_dim_t b_n_size;
+ dnnl_dim_t b_k_size;
+ QuantizationStrategy a_qs;
+ QuantizationStrategy b_qs;
+ bool use_azp;
+ dnnl::memory::data_type c_type;
+
+ friend bool operator==(const ClassMatmulCacheKey& l,
+ const ClassMatmulCacheKey& r);
+ };
+
+ struct MSizeCacheKey {
+ dnnl_dim_t a_m_size;
+ bool use_bias;
+ dnnl::memory::data_type bias_type;
+
+ friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
+ };
+
+ using MSizeCache = DNNLPrimitiveCache;
+ using ClassMatmulCache =
+ DNNLPrimitiveCache>;
+
+ struct ExecArgs : public MSizeCacheKey {
+ const int8_t* a_ptr;
+ const float* a_scales_ptr;
+ const int32_t* a_zero_points_ptr;
+ const void* bias_ptr;
+ void* c_ptr;
+ };
+
+ public:
+ W8A8MatMulPrimitiveHandler(const Args& args);
+
+ QuantizationStrategy get_input_scale_strategy() const { return a_qs_; }
+
+ bool get_input_use_zero_point() const { return use_azp_; }
+
+ void execute(ExecArgs& args);
+
+ private:
+ dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
+ bool first_time);
+
+ void init_runtime_memory_cache(const Args& args);
+
+ dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
+
+ private:
+ const bool use_azp_;
+ const QuantizationStrategy a_qs_;
+ const QuantizationStrategy b_qs_;
+ std::shared_ptr m_size_cache_;
+};
+
+#endif
diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp
deleted file mode 100644
index 1cb8dc5b25a66e7d677d46206a28ec158c6abe2c..0000000000000000000000000000000000000000
--- a/csrc/cpu/dnnl_helper.hpp
+++ /dev/null
@@ -1,206 +0,0 @@
-#ifndef DNNL_HELPER_HPP
-#define DNNL_HELPER_HPP
-
-#include
-#include
-
-#include "oneapi/dnnl/dnnl.hpp"
-
-namespace {
-template
-struct DNNLType {
- static constexpr dnnl::memory::data_type type =
- dnnl::memory::data_type::undef;
-};
-
-template <>
-struct DNNLType {
- static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
-};
-
-template <>
-struct DNNLType {
- static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
-};
-
-template <>
-struct DNNLType {
- static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
-};
-
-template <>
-struct DNNLType {
- static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
-};
-
-template <>
-struct DNNLType {
- static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
-};
-
-template
-constexpr inline dnnl::memory::data_type get_dnnl_type() {
- return DNNLType>::type;
-}
-}; // namespace
-
-template
-class DNNLPrimitiveHelper {
- public:
- // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
- // A: [M, K], row-major
- // B: [K, N], column-major
- // C: [M, N], row-major
- // bias: [N], row-major, optional
- // a_scales: [MS]
- // b_scales: [NS]
- // Note: Due to the limitation of oneDNN
- // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
- // not supported.
-
- template
- static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
- const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
- dnnl_dim_t K, const float* a_scales,
- const float* b_scales, dnnl_dim_t MS,
- dnnl_dim_t NS) {
- auto&& OutputType = get_dnnl_type();
- auto&& BiasType = get_dnnl_type();
-
- dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
- dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
- dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
-
- dnnl::primitive_attr attr;
- if constexpr (!InputNoScale) {
- if (MS == 1) {
- // per-tensor
- attr.set_scales_mask(DNNL_ARG_SRC, 0);
- } else {
- // per-token
- TORCH_CHECK(false, "per-token quantization is unsupported.");
- }
- }
-
- if (NS == 1) {
- // per-tensor
- attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
- } else {
- // per-channel
- attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
- }
-
- dnnl::matmul::primitive_desc matmul_pd;
-// Create memory descriptors with format_tag::any for the primitive. This
-// enables the matmul primitive to choose memory layouts for an
-// optimized primitive implementation, and these layouts may differ from the
-// ones provided by the user.
-#ifdef __aarch64__
- auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8,
- dnnl::memory::format_tag::any);
- auto mat_weights_md = dnnl::memory::desc(
- {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any);
- auto mat_dst_md =
- dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any);
- if (bias) {
- dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
- matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md,
- mat_weights_md, bias_md,
- mat_dst_md, attr);
- } else {
- matmul_pd = dnnl::matmul::primitive_desc(
- default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr);
- }
-#else
- if (bias) {
- dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
- matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
- bias_md, c_md, attr);
- } else {
- matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
- c_md, attr);
- }
-#endif
- dnnl::matmul matmul(matmul_pd);
-
- auto& engine = default_engine();
-
- dnnl::memory a_m(a_md, engine, (void*)a);
- dnnl::memory b_m(b_md, engine, (void*)b);
- dnnl::memory c_m(c_md, engine, (void*)c);
- dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
- (void*)a_scales);
- dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
- (void*)b_scales);
-
- auto& stream = default_stream();
-
- auto mat_src_mem = a_m;
- auto mat_weights_mem = b_m;
- auto mat_dst_mem = c_m;
-#ifdef __aarch64__
- if (matmul_pd.weights_desc() != b_m.get_desc()) {
- mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine);
- dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem);
- }
-#endif
- if constexpr (InputNoScale) {
- if (bias) {
- dnnl::memory::desc bias_md({N}, BiasType, {1});
- dnnl::memory bias_m(bias_md, engine, (void*)bias);
- matmul.execute(
- stream, {
- {DNNL_ARG_SRC, mat_src_mem},
- {DNNL_ARG_WEIGHTS, mat_weights_mem},
- {DNNL_ARG_BIAS, bias_m},
- {DNNL_ARG_DST, mat_dst_mem},
- {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
- });
- } else {
- matmul.execute(
- stream, {
- {DNNL_ARG_SRC, mat_src_mem},
- {DNNL_ARG_WEIGHTS, mat_weights_mem},
- {DNNL_ARG_DST, mat_dst_mem},
- {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
- });
- }
- } else {
- if (bias) {
- dnnl::memory::desc bias_md({N}, BiasType, {1});
- dnnl::memory bias_m(bias_md, engine, (void*)bias);
- matmul.execute(
- stream, {
- {DNNL_ARG_SRC, mat_src_mem},
- {DNNL_ARG_WEIGHTS, mat_weights_mem},
- {DNNL_ARG_BIAS, bias_m},
- {DNNL_ARG_DST, mat_dst_mem},
- {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
- {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
- });
- } else {
- matmul.execute(
- stream, {
- {DNNL_ARG_SRC, mat_src_mem},
- {DNNL_ARG_WEIGHTS, mat_weights_mem},
- {DNNL_ARG_DST, mat_dst_mem},
- {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
- {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
- });
- }
- }
- stream.wait();
- }
-
- private:
- static dnnl::engine& default_engine() {
- static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
- return engine;
- }
-
- static dnnl::stream& default_stream() {
- static dnnl::stream stream(default_engine());
- return stream;
- }
-};
-#endif
diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..acc3b9ecde143382728a9a709081971de30ad3f5
--- /dev/null
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -0,0 +1,494 @@
+#include "cpu_types.hpp"
+#include "dnnl_helper.h"
+
+namespace {
+template
+struct KernelVecType {
+ using load_vec_type = void;
+ using cvt_vec_type = void;
+};
+
+template <>
+struct KernelVecType {
+ using load_vec_type = vec_op::FP32Vec16;
+ using cvt_vec_type = vec_op::FP32Vec16;
+};
+
+#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
+template <>
+struct KernelVecType {
+ using load_vec_type = vec_op::BF16Vec16;
+ using cvt_vec_type = vec_op::FP32Vec16;
+};
+#endif
+
+template <>
+struct KernelVecType {
+#if defined(__powerpc64__) || defined(__s390x__)
+ // Power architecture-specific vector type
+ using load_vec_type = vec_op::FP32Vec16;
+#else
+ // Fallback for other architectures
+ using load_vec_type = vec_op::FP16Vec16;
+#endif
+ using cvt_vec_type = vec_op::FP32Vec16;
+};
+
+template
+void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+ const float* scale, const int32_t* azp,
+ const int64_t num_tokens,
+ const int64_t input_stride,
+ const int64_t hidden_size) {
+ using load_vec_t = typename KernelVecType::load_vec_type;
+ using cvt_vec_t = typename KernelVecType::cvt_vec_type;
+ constexpr int64_t vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+ constexpr float i8_min =
+ static_cast(std::numeric_limits::min());
+ constexpr float i8_max =
+ static_cast(std::numeric_limits::max());
+ const cvt_vec_t inv_scale(1.0 / *scale);
+ const cvt_vec_t i8_min_vec(i8_min);
+ const cvt_vec_t i8_max_vec(i8_max);
+
+ cvt_vec_t zp_vec;
+ if constexpr (AZP) {
+ zp_vec = cvt_vec_t(static_cast(*azp));
+ }
+
+#pragma omp parallel for
+ for (int64_t i = 0; i < num_tokens; ++i) {
+ int64_t j = 0;
+ const scalar_t* input_ptr = input + i * input_stride;
+ int8_t* output_ptr = output + i * hidden_size;
+ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+ load_vec_t elems(input_ptr + j);
+ cvt_vec_t elems_fp32(elems);
+ elems_fp32 = elems_fp32 * inv_scale;
+
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + zp_vec;
+ }
+
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+ vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_int8.save(output_ptr + j);
+ }
+
+ load_vec_t elems(input_ptr + j);
+ cvt_vec_t elems_fp32(elems);
+ elems_fp32 = elems_fp32 * inv_scale;
+
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + zp_vec;
+ }
+
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+ vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_int8.save(output_ptr + j, hidden_size - j);
+ }
+}
+
+template
+void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
+ float* scale, int32_t* azp,
+ const int64_t num_tokens,
+ const int64_t input_stride,
+ const int64_t hidden_size) {
+ using load_vec_t = typename KernelVecType::load_vec_type;
+ using cvt_vec_t = typename KernelVecType::cvt_vec_type;
+ constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+ constexpr float i8_min =
+ static_cast(std::numeric_limits::min());
+ constexpr float i8_max =
+ static_cast(std::numeric_limits::max());
+ const cvt_vec_t i8_min_vec(i8_min);
+ const cvt_vec_t i8_max_vec(i8_max);
+
+#pragma omp parallel for
+ for (int64_t i = 0; i < num_tokens; ++i) {
+ cvt_vec_t max_value(std::numeric_limits::lowest());
+ cvt_vec_t min_value(std::numeric_limits::max());
+ {
+ int64_t j = 0;
+ const scalar_t* input_ptr = input + i * input_stride;
+ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+ load_vec_t elems(input_ptr + j);
+ cvt_vec_t elems_fp32(elems);
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32);
+ min_value = min_value.min(elems_fp32);
+ } else {
+ max_value = max_value.max(elems_fp32.abs());
+ }
+ }
+
+ load_vec_t elems(input_ptr + j);
+ cvt_vec_t elems_fp32(elems);
+
+ if (j + vec_elem_num == hidden_size) {
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32);
+ min_value = min_value.min(elems_fp32);
+ } else {
+ max_value = max_value.max(elems_fp32.abs());
+ }
+ } else {
+ if constexpr (AZP) {
+ max_value = max_value.max(elems_fp32, hidden_size - j);
+ min_value = min_value.min(elems_fp32, hidden_size - j);
+ } else {
+ max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
+ }
+ }
+ }
+
+ float scale_val, azp_val;
+ if constexpr (AZP) {
+ float max_scalar = max_value.reduce_max();
+ float min_scalar = min_value.reduce_min();
+ scale_val = (max_scalar - min_scalar) / 255.0f;
+ azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
+ azp[i] = azp_val;
+ scale[i] = scale_val;
+ } else {
+ scale_val = max_value.reduce_max() / 127.0f;
+ scale[i] = scale_val;
+ }
+
+ const cvt_vec_t inv_scale(1.0 / scale_val);
+ const cvt_vec_t azp_vec(azp_val);
+
+ {
+ int64_t j = 0;
+ const scalar_t* input_ptr = input + i * input_stride;
+ int8_t* output_ptr = output + i * hidden_size;
+ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
+ load_vec_t elems(input_ptr + j);
+ cvt_vec_t elems_fp32(elems);
+ elems_fp32 = (elems_fp32 * inv_scale);
+
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + azp_vec;
+ }
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+ vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_int8.save(output_ptr + j);
+ }
+
+ load_vec_t elems(input_ptr + j);
+ cvt_vec_t elems_fp32(elems);
+ elems_fp32 = (elems_fp32 * inv_scale);
+
+ if constexpr (AZP) {
+ elems_fp32 = elems_fp32 + azp_vec;
+ }
+ elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
+ vec_op::INT8Vec16 elems_int8(elems_fp32);
+ elems_int8.save(output_ptr + j, hidden_size - j);
+ }
+ }
+}
+
+template
+void dynamic_quant_epilogue(const float* input, scalar_t* output,
+ const float* a_scale, const int32_t* azp,
+ const float* azp_adj, const scalar_t* bias,
+ const int64_t num_tokens,
+ const int64_t hidden_size) {
+ CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
+ using load_vec_t = typename KernelVecType::load_vec_type;
+ using cvt_vec_t = typename KernelVecType::cvt_vec_type;
+ constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
+
+ const int64_t thread_num = omp_get_max_threads();
+ if (num_tokens > thread_num) {
+#pragma omp parallel for
+ for (int64_t i = 0; i < num_tokens; ++i) {
+ const float* input_ptr = input + i * hidden_size;
+ scalar_t* output_ptr = output + i * hidden_size;
+ int64_t j = 0;
+ cvt_vec_t token_scale_vec(a_scale[i]);
+ cvt_vec_t token_zp_scale_vec;
+ if constexpr (AZP) {
+ float zp_scale_val = a_scale[i] * static_cast(azp[i]);
+ token_zp_scale_vec = cvt_vec_t(zp_scale_val);
+ }
+ for (; j < hidden_size - vec_elem_num; ++j) {
+ cvt_vec_t elems_fp32(input_ptr + j);
+ elems_fp32 = elems_fp32 * token_scale_vec;
+ if constexpr (AZP) {
+ cvt_vec_t azp_adj_fp32(azp_adj + j);
+ elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
+ }
+ if constexpr (Bias) {
+ load_vec_t bias_vec(bias + j);
+ cvt_vec_t bias_vec_fp32(bias_vec);
+ elems_fp32 = elems_fp32 + bias_vec_fp32;
+ }
+ load_vec_t elems_out(elems_fp32);
+ elems_out.save(output_ptr + j);
+ }
+ cvt_vec_t elems_fp32(input_ptr + j);
+ elems_fp32 = elems_fp32 * token_scale_vec;
+ if constexpr (AZP) {
+ cvt_vec_t azp_adj_fp32(azp_adj + j);
+ elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
+ }
+ if constexpr (Bias) {
+ load_vec_t bias_vec(bias + j);
+ cvt_vec_t bias_vec_fp32(bias_vec);
+ elems_fp32 = elems_fp32 + bias_vec_fp32;
+ }
+ load_vec_t elems_out(elems_fp32);
+ elems_out.save(output_ptr + j, hidden_size - j);
+ }
+ } else {
+ const int64_t vec_iteration =
+ (hidden_size + vec_elem_num - 1) / vec_elem_num;
+ const int64_t vec_iteration_per_thread =
+ (vec_iteration + thread_num - 1) / thread_num;
+ const int64_t elem_num_per_thread = vec_iteration_per_thread * vec_elem_num;
+#pragma omp parallel for schedule(static, 1)
+ for (int64_t i = 0; i < thread_num; ++i) {
+ const int64_t start = elem_num_per_thread * i;
+ const int64_t end = std::min(hidden_size, elem_num_per_thread + start);
+ for (int64_t j = 0; j < num_tokens; ++j) {
+ cvt_vec_t token_scale_vec(a_scale[j]);
+ cvt_vec_t token_zp_scale_vec;
+ if constexpr (AZP) {
+ float zp_scale_val = a_scale[j] * static_cast(azp[j]);
+ token_zp_scale_vec = cvt_vec_t(zp_scale_val);
+ }
+ int64_t k = start;
+ const float* input_ptr = input + j * hidden_size;
+ scalar_t* output_ptr = output + j * hidden_size;
+ for (; k < end - vec_elem_num; k += vec_elem_num) {
+ cvt_vec_t elems_fp32(input_ptr + k);
+ elems_fp32 = elems_fp32 * token_scale_vec;
+ if constexpr (AZP) {
+ cvt_vec_t azp_adj_fp32(azp_adj + k);
+ elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
+ }
+ if constexpr (Bias) {
+ load_vec_t bias_vec(bias + k);
+ cvt_vec_t bias_vec_fp32(bias_vec);
+ elems_fp32 = elems_fp32 + bias_vec_fp32;
+ }
+ load_vec_t elems_out(elems_fp32);
+ elems_out.save(output_ptr + k);
+ }
+ if (k < end) {
+ cvt_vec_t elems_fp32(input_ptr + k);
+ elems_fp32 = elems_fp32 * token_scale_vec;
+ if constexpr (AZP) {
+ cvt_vec_t azp_adj_fp32(azp_adj + k);
+ elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
+ }
+ if constexpr (Bias) {
+ load_vec_t bias_vec(bias + k);
+ cvt_vec_t bias_vec_fp32(bias_vec);
+ elems_fp32 = elems_fp32 + bias_vec_fp32;
+ }
+ load_vec_t elems_out(elems_fp32);
+ elems_out.save(output_ptr + k, end - k);
+ }
+ }
+ }
+ }
+}
+} // namespace
+
+int64_t create_onednn_scaled_mm_handler(
+ const torch::Tensor& b, // [IC, OC], column-major
+ const torch::Tensor& b_scales, // [1] or [OC]
+ at::ScalarType output_type, bool dynamic_act_quant, bool use_azp,
+ int64_t primitive_cache_size) {
+ TORCH_CHECK(b.dim() == 2);
+ TORCH_CHECK(b.stride(0) == 1); // Column-major
+ TORCH_CHECK(b_scales.is_contiguous());
+
+ W8A8MatMulPrimitiveHandler::Args args;
+ args.primitive_cache_size = primitive_cache_size;
+
+ if (b_scales.numel() == 1) {
+ args.b_quantization_strategy =
+ W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR;
+ } else {
+ TORCH_CHECK_EQ(b_scales.numel(), b.size(1));
+ args.b_quantization_strategy =
+ W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_OUTPUT_CHANNEL;
+ }
+ args.b_scales_ptr = b_scales.data_ptr();
+ args.b_k_size = b.size(0);
+ args.b_k_stride = b.stride(0);
+ args.b_n_size = b.size(1);
+ args.b_n_stride = b.stride(1);
+ args.b_ptr = b.data_ptr();
+
+ if (dynamic_act_quant) {
+ // dynamic per-token, bias, A scales and A zps will be applied in outside.
+ args.a_quantization_strategy =
+ W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN;
+ args.use_a_zero_point = false;
+ } else {
+ // static per-tensor
+ args.a_quantization_strategy =
+ W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR;
+ args.use_a_zero_point = use_azp;
+ }
+
+ VLLM_DISPATCH_FLOATING_TYPES(output_type, "create_onednn_scaled_mm_handler",
+ [&] {
+ if (dynamic_act_quant) {
+ args.c_type = get_dnnl_type();
+ } else {
+ args.c_type = get_dnnl_type();
+ }
+ });
+
+ return reinterpret_cast(new W8A8MatMulPrimitiveHandler(args));
+}
+
+void onednn_scaled_mm(
+ torch::Tensor& c, // [M, OC], row-major
+ const torch::Tensor& a, // [M, IC], row-major
+ const torch::Tensor& a_scales, // [M] or [1]
+ const std::optional& azp, // [M] or [1]
+ const std::optional& azp_adj, // [M] or [1]
+ const std::optional& bias, // [N]
+ int64_t handler) {
+ CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
+ TORCH_CHECK(a.dim() == 2);
+ TORCH_CHECK(a.is_contiguous());
+ TORCH_CHECK(c.is_contiguous());
+ W8A8MatMulPrimitiveHandler* ptr =
+ reinterpret_cast(handler);
+ const int32_t* azp_ptr = nullptr;
+ if (azp.has_value()) {
+ azp_ptr = azp->data_ptr();
+ }
+ if (ptr->get_input_scale_strategy() ==
+ W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) {
+ TORCH_CHECK_EQ(a_scales.numel(), 1);
+ }
+
+ W8A8MatMulPrimitiveHandler::ExecArgs exec_args;
+ exec_args.a_ptr = a.data_ptr();
+ exec_args.a_m_size = a.size(0);
+ exec_args.bias_ptr = nullptr;
+ exec_args.use_bias = false;
+ exec_args.a_scales_ptr = nullptr;
+ exec_args.a_zero_points_ptr = nullptr;
+
+ VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "onednn_scaled_mm", [&] {
+ if (ptr->get_input_scale_strategy() ==
+ W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) {
+ if (bias.has_value()) {
+ exec_args.bias_ptr = bias->data_ptr();
+ exec_args.bias_type = get_dnnl_type();
+ exec_args.use_bias = true;
+ }
+ exec_args.a_scales_ptr = a_scales.data_ptr();
+ exec_args.a_zero_points_ptr = azp_ptr;
+ exec_args.c_ptr = c.data_ptr();
+ ptr->execute(exec_args);
+ } else if (ptr->get_input_scale_strategy() ==
+ W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN) {
+ torch::Tensor tmp_fp32_out =
+ torch::empty_like(c, ::at::ScalarType::Float);
+ exec_args.c_ptr = tmp_fp32_out.data_ptr();
+ ptr->execute(exec_args);
+ if (bias.has_value()) {
+ if (azp.has_value()) {
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(),
+ bias->data_ptr(), c.size(0), c.size(1));
+ } else {
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), azp_ptr, nullptr,
+ bias->data_ptr(), c.size(0), c.size(1));
+ }
+ } else {
+ if (azp.has_value()) {
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), azp_ptr, azp_adj->data_ptr(),
+ (scalar_t*)nullptr, c.size(0), c.size(1));
+ } else {
+ dynamic_quant_epilogue(
+ tmp_fp32_out.data_ptr(), c.data_ptr(),
+ a_scales.data_ptr(), azp_ptr, nullptr, (scalar_t*)nullptr,
+ c.size(0), c.size(1));
+ }
+ }
+ } else {
+ TORCH_CHECK(false, "invalid act quant type.");
+ }
+ });
+}
+
+// static-per-tensor quantization.
+void static_scaled_int8_quant(
+ torch::Tensor& out, // [batch, hidden_size]
+ const torch::Tensor& input, // [batch, hidden_size]
+ const torch::Tensor& scale, std::optional const& azp) {
+ CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
+ TORCH_CHECK(out.is_contiguous());
+ TORCH_CHECK_EQ(input.dim(), 2);
+ TORCH_CHECK_EQ(input.stride(1), 1);
+ TORCH_CHECK(scale.numel() == 1);
+ TORCH_CHECK(!azp.has_value() || azp->numel() == 1);
+
+ const int64_t stride = input.stride(0);
+ const int64_t hidden_size = input.size(1);
+ const int64_t num_tokens = input.size(0);
+ VLLM_DISPATCH_FLOATING_TYPES(
+ input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
+ if (azp.has_value()) {
+ static_scaled_int8_quant_impl(
+ input.data_ptr(), out.data_ptr(),
+ scale.data_ptr(), azp->data_ptr(), num_tokens,
+ stride, hidden_size);
+ } else {
+ static_scaled_int8_quant_impl(input.data_ptr(),
+ out.data_ptr(),
+ scale.data_ptr(), nullptr,
+ num_tokens, stride, hidden_size);
+ }
+ });
+}
+
+// dynamic-per-token quantization.
+void dynamic_scaled_int8_quant(
+ torch::Tensor& out, // [batch, hidden_size]
+ const torch::Tensor& input, // [batch, hidden_size]
+ torch::Tensor& scale, // [batch, 1]
+ std::optional const& azp) {
+ CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
+ TORCH_CHECK(out.is_contiguous());
+ TORCH_CHECK_EQ(input.dim(), 2);
+ TORCH_CHECK_EQ(input.stride(1), 1);
+
+ const int64_t hidden_size = input.size(1);
+ const int64_t num_tokens = input.size(0);
+ const int64_t stride = input.stride(0);
+ VLLM_DISPATCH_FLOATING_TYPES(
+ input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
+ if (azp.has_value()) {
+ dynamic_scaled_int8_quant_impl(
+ input.data_ptr(), out.data_ptr(),
+ scale.data_ptr(), azp->data_ptr(), num_tokens,
+ stride, hidden_size);
+ } else {
+ dynamic_scaled_int8_quant_impl(
+ input.data_ptr(), out.data_ptr(),
+ scale.data_ptr(), nullptr, num_tokens, stride,
+ hidden_size);
+ }
+ });
+}
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
deleted file mode 100644
index 6e120b8d20a7ee1e1f9ca8018281c1b6fdf68339..0000000000000000000000000000000000000000
--- a/csrc/cpu/quant.cpp
+++ /dev/null
@@ -1,951 +0,0 @@
-#include "cpu_types.hpp"
-#include "dnnl_helper.hpp"
-
-namespace {
-template
-struct KernelVecType {
- using load_vec_type = void;
- using azp_adj_load_vec_type = void;
- using cvt_vec_type = void;
-};
-
-template <>
-struct KernelVecType {
- using load_vec_type = vec_op::FP32Vec16;
- using azp_adj_load_vec_type = vec_op::INT32Vec16;
- using cvt_vec_type = vec_op::FP32Vec16;
-};
-
-#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
-template <>
-struct KernelVecType {
- using load_vec_type = vec_op::BF16Vec16;
- using azp_adj_load_vec_type = vec_op::INT32Vec16;
- using cvt_vec_type = vec_op::FP32Vec16;
-};
-#endif
-
-template <>
-struct KernelVecType {
-#if defined(__powerpc64__) || defined(__s390x__)
- // Power architecture-specific vector type
- using load_vec_type = vec_op::FP32Vec16;
-#else
- // Fallback for other architectures
- using load_vec_type = vec_op::FP16Vec16;
-#endif
- using azp_adj_load_vec_type = vec_op::INT32Vec16;
- using cvt_vec_type = vec_op::FP32Vec16;
-};
-
-#if defined(__AVX512F__) || defined(__aarch64__)
-template
-void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- const float* scale, const int32_t* azp,
- const int num_tokens,
- const int hidden_size) {
- using load_vec_t = typename KernelVecType::load_vec_type;
- using cvt_vec_t = typename KernelVecType::cvt_vec_type;
- constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
- constexpr float i8_min =
- static_cast(std::numeric_limits::min());
- constexpr float i8_max =
- static_cast(std::numeric_limits::max());
- const cvt_vec_t inv_scale(1.0 / *scale);
- const cvt_vec_t i8_min_vec(i8_min);
- const cvt_vec_t i8_max_vec(i8_max);
-
- cvt_vec_t zp_vec;
- if constexpr (AZP) {
- zp_vec = cvt_vec_t(static_cast(*azp));
- }
-
- #pragma omp parallel for
- for (int i = 0; i < num_tokens; ++i) {
- int j = 0;
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- elems_fp32 = elems_fp32 * inv_scale;
-
- if constexpr (AZP) {
- elems_fp32 = elems_fp32 + zp_vec;
- }
-
- elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- elems_int8.save(output + i * hidden_size + j);
- }
-
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- elems_fp32 = elems_fp32 * inv_scale;
-
- if constexpr (AZP) {
- elems_fp32 = elems_fp32 + zp_vec;
- }
-
- elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- elems_int8.save(output + i * hidden_size + j, hidden_size - j);
- }
-}
-
-template
-void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- float* scale, int32_t* azp,
- const int num_tokens,
- const int hidden_size) {
- using load_vec_t = typename KernelVecType::load_vec_type;
- using cvt_vec_t = typename KernelVecType::cvt_vec_type;
- constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
- constexpr float i8_min =
- static_cast(std::numeric_limits::min());
- constexpr float i8_max =
- static_cast(std::numeric_limits::max());
- const cvt_vec_t i8_min_vec(i8_min);
- const cvt_vec_t i8_max_vec(i8_max);
-
- #pragma omp parallel for
- for (int i = 0; i < num_tokens; ++i) {
- cvt_vec_t max_value(std::numeric_limits::lowest());
- cvt_vec_t min_value(std::numeric_limits::max());
- {
- int j = 0;
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- if constexpr (AZP) {
- max_value = max_value.max(elems_fp32);
- min_value = min_value.min(elems_fp32);
- } else {
- max_value = max_value.max(elems_fp32.abs());
- }
- }
-
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
-
- if (j + vec_elem_num == hidden_size) {
- if constexpr (AZP) {
- max_value = max_value.max(elems_fp32);
- min_value = min_value.min(elems_fp32);
- } else {
- max_value = max_value.max(elems_fp32.abs());
- }
- } else {
- if constexpr (AZP) {
- max_value = max_value.max(elems_fp32, hidden_size - j);
- min_value = min_value.min(elems_fp32, hidden_size - j);
- } else {
- max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
- }
- }
- }
-
- float scale_val, azp_val;
- if constexpr (AZP) {
- float max_scalar = max_value.reduce_max();
- float min_scalar = min_value.reduce_min();
- scale_val = (max_scalar - min_scalar) / 255.0f;
- azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
- azp[i] = static_cast(azp_val);
- scale[i] = scale_val;
- } else {
- scale_val = max_value.reduce_max() / 127.0f;
- scale[i] = scale_val;
- }
-
- const cvt_vec_t inv_scale(1.0 / scale_val);
- const cvt_vec_t azp_vec(azp_val);
-
- {
- int j = 0;
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- elems_fp32 = (elems_fp32 * inv_scale);
-
- if constexpr (AZP) {
- elems_fp32 = elems_fp32 + azp_vec;
- }
- elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- elems_int8.save(output + i * hidden_size + j);
- }
-
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- elems_fp32 = (elems_fp32 * inv_scale);
-
- if constexpr (AZP) {
- elems_fp32 = elems_fp32 + azp_vec;
- }
- elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- elems_int8.save(output + i * hidden_size + j, hidden_size - j);
- }
- }
-}
-
-template
-void static_quant_epilogue(const float* input, scalar_t* output,
- const float a_scale, const float* b_scale,
- const int32_t* azp_with_adj, const int num_tokens,
- const int hidden_size) {
- CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
- using load_vec_t = typename KernelVecType::load_vec_type;
- using azp_adj_load_vec_t =
- typename KernelVecType::azp_adj_load_vec_type;
- using cvt_vec_t = typename KernelVecType::cvt_vec_type;
- constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
- #pragma omp parallel for
- for (int i = 0; i < num_tokens; ++i) {
- cvt_vec_t a_scale_vec(a_scale);
- cvt_vec_t b_scale_vec(*b_scale);
- cvt_vec_t scale_vec = a_scale_vec * b_scale_vec;
-
- int j = 0;
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- cvt_vec_t elems_fp32(input + i * hidden_size + j);
- azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
- cvt_vec_t azp_adj_fp32(azp_adj_vec);
-
- if constexpr (PerChannel) {
- b_scale_vec = cvt_vec_t(b_scale + j);
- scale_vec = b_scale_vec * a_scale_vec;
- }
-
- elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
-
- load_vec_t elems_out(elems_fp32);
- elems_out.save(output + i * hidden_size + j);
- }
-
- cvt_vec_t elems_fp32(input + i * hidden_size + j);
- azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
- cvt_vec_t azp_adj_fp32(azp_adj_vec);
-
- if constexpr (PerChannel) {
- b_scale_vec = cvt_vec_t(b_scale + j);
- scale_vec = b_scale_vec * a_scale_vec;
- }
-
- elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
-
- load_vec_t elems_out(elems_fp32);
- elems_out.save(output + i * hidden_size + j, hidden_size - j);
- }
-}
-
-template
-void dynamic_quant_epilogue(const float* input, scalar_t* output,
- const float* a_scale, const float* b_scale,
- const int32_t* azp, const int32_t* azp_adj,
- const scalar_t* bias, const int num_tokens,
- const int hidden_size) {
- CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
- using load_vec_t = typename KernelVecType::load_vec_type;
- using azp_adj_load_vec_t =
- typename KernelVecType::azp_adj_load_vec_type;
- using cvt_vec_t = typename KernelVecType::cvt_vec_type;
- constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
- #pragma omp parallel for
- for (int i = 0; i < num_tokens; ++i) {
- int j = 0;
- cvt_vec_t token_scale_vec(a_scale[i]);
- cvt_vec_t token_zp_scale_vec;
- if constexpr (AZP) {
- float zp_scale_val = a_scale[i] * static_cast(azp[i]);
- if constexpr (!PerChannel) {
- zp_scale_val *= *b_scale;
- }
- token_zp_scale_vec = cvt_vec_t(zp_scale_val);
- }
-
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- cvt_vec_t elems_fp32(input + i * hidden_size + j);
- elems_fp32 = elems_fp32 * token_scale_vec;
-
- if constexpr (AZP) {
- azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
- cvt_vec_t azp_adj_fp32(azp_adj_vec);
- azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
-
- if constexpr (PerChannel) {
- cvt_vec_t b_scale_vec(b_scale + j);
- azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
- }
-
- elems_fp32 = elems_fp32 - azp_adj_fp32;
- }
-
- if constexpr (Bias) {
- load_vec_t bias_vec(bias + j);
- cvt_vec_t bias_vec_fp32(bias_vec);
- elems_fp32 = elems_fp32 + bias_vec_fp32;
- }
-
- load_vec_t elems_out(elems_fp32);
- elems_out.save(output + i * hidden_size + j);
- }
-
- cvt_vec_t elems_fp32(input + i * hidden_size + j);
- elems_fp32 = elems_fp32 * token_scale_vec;
-
- if constexpr (AZP) {
- azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
- cvt_vec_t azp_adj_fp32(azp_adj_vec);
- azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
-
- if constexpr (PerChannel) {
- cvt_vec_t b_scale_vec(b_scale + j);
- azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
- }
-
- elems_fp32 = elems_fp32 - azp_adj_fp32;
- }
-
- if constexpr (Bias) {
- load_vec_t bias_vec(bias + j);
- cvt_vec_t bias_vec_fp32(bias_vec);
- elems_fp32 = elems_fp32 + bias_vec_fp32;
- }
-
- load_vec_t elems_out(elems_fp32);
- elems_out.save(output + i * hidden_size + j, hidden_size - j);
- }
-}
-#elif defined(__powerpc64__)
-template
-void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- const float* scale, const int32_t* azp,
- const int num_tokens,
- const int hidden_size) {
- using load_vec_t = typename KernelVecType::load_vec_type;
- using cvt_vec_t = typename KernelVecType::cvt_vec_type;
- constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
- constexpr float i8_min =
- static_cast(std::numeric_limits::min());
- constexpr float i8_max =
- static_cast(std::numeric_limits::max());
-
- const cvt_vec_t inv_scale(1.0 / *scale);
- const cvt_vec_t i8_min_vec(i8_min);
- const cvt_vec_t i8_max_vec(i8_max);
-
- cvt_vec_t zp_vec;
- if constexpr (AZP) {
- zp_vec = cvt_vec_t(static_cast(*azp));
- }
- #pragma omp parallel for
- for (int i = 0; i < num_tokens; ++i) {
- int j = 0;
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- elems_fp32 = elems_fp32 * inv_scale;
- if constexpr (AZP) {
- elems_fp32 = elems_fp32 + zp_vec;
- }
- elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- elems_int8.save(output + i * hidden_size + j);
- }
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- elems_fp32 = elems_fp32 * inv_scale;
-
- if constexpr (AZP) {
- elems_fp32 = elems_fp32 + zp_vec;
- }
-
- elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- elems_int8.save(output + i * hidden_size + j, hidden_size - j);
- }
-}
-template
-void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
- float* scale, int32_t* azp,
- const int num_tokens,
- const int hidden_size) {
- using load_vec_t = typename KernelVecType::load_vec_type;
- using cvt_vec_t = typename KernelVecType::cvt_vec_type;
- constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
- constexpr float i8_min =
- static_cast(std::numeric_limits::min());
- constexpr float i8_max =
- static_cast(std::numeric_limits::max());
- const cvt_vec_t i8_min_vec(i8_min);
- const cvt_vec_t i8_max_vec(i8_max);
-
- #pragma omp parallel for
- for (int i = 0; i < num_tokens; ++i) {
- cvt_vec_t max_value(std::numeric_limits::lowest());
- cvt_vec_t min_value(std::numeric_limits::max());
- {
- int j = 0;
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- if constexpr (AZP) {
- max_value = max_value.max(elems_fp32);
- min_value = min_value.min(elems_fp32);
- } else {
- max_value = max_value.max(elems_fp32.abs());
- }
- }
-
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
-
- if (j + vec_elem_num == hidden_size) {
- if constexpr (AZP) {
- max_value = max_value.max(elems_fp32);
- min_value = min_value.min(elems_fp32);
- } else {
- max_value = max_value.max(elems_fp32.abs());
- }
- } else {
- if constexpr (AZP) {
- max_value = max_value.max(elems_fp32, hidden_size - j);
- min_value = min_value.min(elems_fp32, hidden_size - j);
- } else {
- max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
- }
- }
- }
-
- float scale_val, azp_val;
- if constexpr (AZP) {
- float max_scalar = max_value.reduce_max();
- float min_scalar = min_value.reduce_min();
- scale_val = (max_scalar - min_scalar) / 255.0f;
- azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
- azp[i] = static_cast(azp_val);
- scale[i] = scale_val;
- } else {
- scale_val = max_value.reduce_max() / 127.0f;
- scale[i] = scale_val;
- }
-
- const cvt_vec_t inv_scale(1.0 / scale_val);
- const cvt_vec_t azp_vec(azp_val);
-
- {
- int j = 0;
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- elems_fp32 = (elems_fp32 * inv_scale);
-
- if constexpr (AZP) {
- elems_fp32 = elems_fp32 + azp_vec;
- }
- elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- elems_int8.save(output + i * hidden_size + j);
- }
-
- load_vec_t elems(input + i * hidden_size + j);
- cvt_vec_t elems_fp32(elems);
- elems_fp32 = (elems_fp32 * inv_scale);
-
- if constexpr (AZP) {
- elems_fp32 = elems_fp32 + azp_vec;
- }
- elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
- vec_op::INT8Vec16 elems_int8(elems_fp32);
- elems_int8.save(output + i * hidden_size + j, hidden_size - j);
- }
- }
-}
-template
-void static_quant_epilogue(const float* input, scalar_t* output,
- const float a_scale, const float* b_scale,
- const int32_t* azp_with_adj, const int num_tokens,
- const int hidden_size) {
- CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
- using load_vec_t = typename KernelVecType::load_vec_type;
- using azp_adj_load_vec_t =
- typename KernelVecType::azp_adj_load_vec_type;
- using cvt_vec_t = typename KernelVecType::cvt_vec_type;
- constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
-
- #pragma omp parallel for
- for (int i = 0; i < num_tokens; ++i) {
- cvt_vec_t a_scale_vec(a_scale);
- cvt_vec_t b_scale_vec(*b_scale);
- cvt_vec_t scale_vec = a_scale_vec * b_scale_vec;
-
- int j = 0;
- for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
- cvt_vec_t elems_fp32(input + i * hidden_size + j);
- azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
- cvt_vec_t azp_adj_fp32(azp_adj_vec);
-
- if constexpr (PerChannel) {
- b_scale_vec = cvt_vec_t(b_scale + j);
- scale_vec = b_scale_vec * a_scale_vec;
- }
- elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
- load_vec_t elems_out(elems_fp32);
- elems_out.save(output + i * hidden_size + j);
- }
-
- cvt_vec_t elems_fp32(input + i * hidden_size + j);
- azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
- cvt_vec_t azp_adj_fp32(azp_adj_vec);
-
- if constexpr (PerChannel) {
- b_scale_vec = cvt_vec_t(b_scale + j);
- scale_vec = b_scale_vec * a_scale_vec;
- }
-
- elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
-
- load_vec_t elems_out(elems_fp32);
- elems_out.save(output + i * hidden_size + j, hidden_size - j);
- }
-}
-template
-void dynamic_quant_epilogue(const float* input, scalar_t* output,
- const float* a_scale, const float* b_scale,
- const int32_t* azp, const int32_t* azp_adj,
- const scalar_t* bias, const int num_tokens,
- const int hidden_size) {
- CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
- using load_vec_t = typename KernelVecType::load_vec_type;
- using azp_adj_load_vec_t =
- typename KernelVecType