diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 76f6d7aeca0d86f128dc820005166044525547eb..77ee313687fc8a8a1388f90475566fbaea276bba 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -5,11 +5,11 @@ import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB # Note that we have 800 MiB quota, please use it wisely. # See https://github.com/pypi/support/issues/6326 . # Please also sync the value with the one in Dockerfile. -VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450)) +VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml similarity index 59% rename from .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml rename to .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml index 1bce7e7fdf146fe6e19f4e5191a0b310364535fa..56ec933c9cc0e5e1fc8041db7f485fa272575d20 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml @@ -1,12 +1,12 @@ # For vllm script, with -t option (tensor parallel size). -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 -model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 +model_name: "HandH1998/QQQ-Llama-3-8b-g128" tasks: - name: "gsm8k" metrics: - name: "exact_match,strict-match" - value: 0.595 + value: 0.419 - name: "exact_match,flexible-extract" - value: 0.582 + value: 0.416 limit: 1000 num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ccb4f84201b77437200ae69a0d5a28869a0dfe01 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml @@ -0,0 +1,12 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + # TODO(zhewenl): model card is 0.90, but the actual score is 0.80. + value: 0.80 +limit: 100 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml new file mode 100644 index 0000000000000000000000000000000000000000..46f1a9fbf6ff95465e75b47ee23fcaf077fa8852 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml @@ -0,0 +1,10 @@ +# For hf script, without -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5 +model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" +tasks: +- name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.80 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml index a2f235f485815848db1106809d714fd519f24e60..aa4fb9fa03d6d7baa0868bdacb650a82e7e25200 100644 --- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml @@ -1,4 +1,5 @@ -# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 +# For vllm script, with -t option (tensor parallel size) +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" tasks: - name: "gsm8k" diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f3c31743e75ba0298e75489300a0caa96995dde --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -0,0 +1,12 @@ +# For vllm script, with -t option (tensor parallel size). +# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1 + +model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +backend: "vllm-vlm" +tasks: +- name: "chartqa" + metrics: + - name: "relaxed_accuracy,none" + value: 0.855 +limit: 2500 +num_fewshot: 0 diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml new file mode 100644 index 0000000000000000000000000000000000000000..514c15d6098ed767e2338886b7a9887ee43c56ee --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml @@ -0,0 +1,14 @@ +model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8" +tasks: + - name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.82 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +num_fewshot: 5 +enforce_eager: false # we use false to speed up the eval process +kv_cache_dtype: fp8 # we use fp8 to speed up the eval process +max_model_len: 40960 +apply_chat_template: true +fewshot_as_multiturn: true +gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>" diff --git a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt new file mode 100644 index 0000000000000000000000000000000000000000..5552391d9eababaf498e82b245ff297cd0c65e68 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt @@ -0,0 +1 @@ +Qwen3-235B-A22B-Instruct-2507-FP8.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt new file mode 100644 index 0000000000000000000000000000000000000000..91e22b6459c1235d6ed7877618423caa7dc297af --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt new file mode 100644 index 0000000000000000000000000000000000000000..1097d220245fc42a9ea77bcde11e5b410614b3eb --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt @@ -0,0 +1 @@ +Qwen2.5-VL-7B-Instruct.yaml \ No newline at end of file diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh new file mode 100755 index 0000000000000000000000000000000000000000..c8db951381b0bd8b4c36ffe0b97c2155aea5c52b --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on chartqa for vllm. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.9 + +usage() { + echo`` + echo "Runs lm eval harness on ChartQA using multimodal vllm." + echo "This pathway is intended to be used to create baselines for " + echo "our correctness tests in vllm's CI." + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:l:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm-vlm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \ + --tasks chartqa \ + --batch_size auto \ + --apply_chat_template \ + --limit $LIMIT diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh old mode 100644 new mode 100755 diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh new file mode 100644 index 0000000000000000000000000000000000000000..d85a1721db9a59d46ab9a7fdaf52b68c8dc13186 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on MMLUPRO for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] + +usage() { + echo`` + echo "Runs lm eval harness on MMLU Pro using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ + --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size auto diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index ceea01166b7f414c36e11e0d7007bca7b580b740..3627b760eddcf769538db2f8e6ba94746847bb08 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -19,21 +19,35 @@ RTOL = 0.08 def launch_lm_eval(eval_config, tp_size): trust_remote_code = eval_config.get("trust_remote_code", False) max_model_len = eval_config.get("max_model_len", 4096) + batch_size = eval_config.get("batch_size", "auto") + backend = eval_config.get("backend", "vllm") + enforce_eager = eval_config.get("enforce_eager", "true") + kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto") model_args = ( f"pretrained={eval_config['model_name']}," f"tensor_parallel_size={tp_size}," - f"enforce_eager=true," + f"enforce_eager={enforce_eager}," + f"kv_cache_dtype={kv_cache_dtype}," f"add_bos_token=true," f"trust_remote_code={trust_remote_code}," - f"max_model_len={max_model_len}" + f"max_model_len={max_model_len}," ) results = lm_eval.simple_evaluate( - model="vllm", + model=backend, model_args=model_args, tasks=[task["name"] for task in eval_config["tasks"]], num_fewshot=eval_config["num_fewshot"], limit=eval_config["limit"], - batch_size="auto", + # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help + # text models. however, this is regressing measured strict-match for + # existing text models in CI, so only apply it for mm, or explicitly set + apply_chat_template=eval_config.get( + "apply_chat_template", backend == "vllm-vlm" + ), + fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), + # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) + gen_kwargs=eval_config.get("gen_kwargs"), + batch_size=batch_size, ) return results diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml deleted file mode 100644 index 4259514940d3fc7b4e3d3062047267f1a40a1b1a..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ /dev/null @@ -1,184 +0,0 @@ -steps: - - label: "Wait for container to be ready" - key: wait-for-container-image - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - containers: - - image: badouralix/curl-jq - command: - - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - - label: "Cleanup H100" - agents: - queue: H100 - depends_on: ~ - command: docker system prune -a --volumes --force - - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - - label: "H200" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H200 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: 4,5,6,7 - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - #- block: "Run H100 Benchmark" - #key: block-h100 - #depends_on: ~ - - - label: "H100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H100 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - # Premerge benchmark - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - - label: "H200" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H200 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: 4,5,6,7 - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - #- block: "Run H100 Benchmark" - #key: block-h100 - #depends_on: ~ - - - label: "H100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H100 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md deleted file mode 100644 index 466def07b6f1f676607c5f47391530ae7454afae..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ /dev/null @@ -1,28 +0,0 @@ -# Nightly benchmark annotation - -## Description - -This file contains the downloading link for benchmarking results. - -- [benchmarking pipeline](artifact://nightly-pipeline.yaml) -- [benchmarking results](artifact://results.zip) -- [benchmarking code](artifact://nightly-benchmarks.zip) - -Please download the visualization scripts in the post - -## Results reproduction - -- Find the docker we use in `benchmarking pipeline` -- Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. - - In the same folder, run the following code: - - ```bash - export HF_TOKEN= - apt update - apt install -y git - unzip nightly-benchmarks.zip - VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - ``` - -And the results will be inside `./benchmarks/results`. diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md deleted file mode 100644 index 2ef36089b6afb88a222c286d62e5dd9ed8d2872d..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ /dev/null @@ -1,39 +0,0 @@ - -# Nightly benchmark - -This benchmark aims to: - -- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload. -- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions. - -Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. - -Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) - -## Setup - -- Docker images: - - vLLM: `vllm/vllm-openai:v0.6.2` - - SGLang: `lmsysorg/sglang:v0.3.2-cu121` - - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` - - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` - - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.* - - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. -- Hardware - - 8x Nvidia A100 GPUs -- Workload: - - Dataset - - ShareGPT dataset - - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) - - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) - - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. - - Models: llama-3 8B, llama-3 70B. - - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). - - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. - - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). - -## Known issues - -- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105). -- TGI does not support `ignore-eos` flag. diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml deleted file mode 100644 index 199517e8b067c05df6375a7bcbc35e5a93ddb838..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ /dev/null @@ -1,196 +0,0 @@ -common_pod_spec: &common_pod_spec - priorityClassName: perf-benchmark - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - -common_container_settings: &common_container_settings - command: - - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - -steps: - - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." - - - - - label: "A100 vllm step 10" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: vllm/vllm-openai:v0.6.2 - <<: *common_container_settings - - - - - label: "A100 sglang benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: lmsysorg/sglang:v0.3.2-cu121 - <<: *common_container_settings - - - label: "A100 lmdeploy benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: openmmlab/lmdeploy:v0.6.1-cu12 - <<: *common_container_settings - - - - - - label: "A100 trt llama-8B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - <<: *common_container_settings - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - name: TEST_SELECTOR - value: "llama8B" - - - - label: "A100 trt llama-70B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - <<: *common_container_settings - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - name: TEST_SELECTOR - value: "llama70B" - - - # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image - # - label: "A100 trt benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # <<: *common_pod_spec - # containers: - # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - # <<: *common_container_settings - - - # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. - # - label: "A100 tgi benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # <<: *common_pod_spec - # containers: - # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 - # <<: *common_container_settings - - - wait - - - label: "Collect the results" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: vllm/vllm-openai:v0.5.0.post1 - command: - - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - - block: ":rocket: check the results!" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py deleted file mode 100644 index 8532ff7ef798cc039926ef8781fccab71f5e17a9..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse - -from transformers import AutoTokenizer - - -def main(model, cachedir): - # Load the tokenizer and save it to the specified directory - tokenizer = AutoTokenizer.from_pretrained(model) - tokenizer.save_pretrained(cachedir) - print(f"Tokenizer saved to {cachedir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Download and save Hugging Face tokenizer" - ) - parser.add_argument("--model", type=str, required=True, help="Name of the model") - parser.add_argument( - "--cachedir", type=str, required=True, help="Directory to save the tokenizer" - ) - - args = parser.parse_args() - main(args.model, args.cachedir) diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py deleted file mode 100644 index 053fd52c35ae906710cbe08579becf0eafc72c4b..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import json -from pathlib import Path - -import numpy as np -import pandas as pd -from tabulate import tabulate - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description="Parse command line arguments for summary-nightly-results script." - ) - parser.add_argument( - "--results-folder", - type=str, - required=True, - help="The folder where the results are stored.", - ) - parser.add_argument( - "--description", type=str, required=True, help="Description of the results." - ) - - args = parser.parse_args() - return args - - -def get_perf(df, method, model, metric): - means = [] - - for qps in [2, 4, 8, 16, "inf"]: - target = df["Test name"].str.contains(model) - target = target & df["Engine"].str.contains(method) - target = target & df["Test name"].str.contains("qps_" + str(qps)) - filtered_df = df[target] - - if filtered_df.empty: - means.append(0.0) - else: - means.append(filtered_df[metric].values[0]) - - return np.array(means) - - -def get_perf_w_std(df, method, model, metric): - if metric in ["TTFT", "ITL"]: - mean = get_perf(df, method, model, "Mean " + metric + " (ms)") - mean = mean.tolist() - std = get_perf(df, method, model, "Std " + metric + " (ms)") - if std.mean() == 0: - std = None - success = get_perf(df, method, model, "Successful req.") - if std is not None: - std = std / np.sqrt(success) - std = std.tolist() - - else: - assert metric == "Tput" - mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( - df, method, model, "Output Tput (tok/s)" - ) - mean = mean.tolist() - std = None - - return mean, std - - -def main(args): - results_folder = Path(args.results_folder) - - results = [] - - # collect results - for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file) as f: - results = results + json.loads(f.read()) - - # generate markdown table - df = pd.DataFrame.from_dict(results) - - md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) - - with open(args.description) as f: - description = f.read() - - description = description.format(nightly_results_benchmarking_table=md_table) - - with open("nightly_results.md", "w") as f: - f.write(description) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py deleted file mode 100644 index ddea1d2b1b1ed5710f5f878c663e83f388a62b73..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from lmdeploy.serve.openai.api_client import APIClient - -api_client = APIClient("http://localhost:8000") -model_name = api_client.available_models[0] - -print(model_name) diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh deleted file mode 100644 index 69b6b146b3549bee8d6c3fb336c306de0cf9de55..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -set -ex -set -o pipefail - - -main() { - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get update && apt-get -y install jq) - (which zip) || (apt-get install -y zip) - - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip plotting the results." - exit 0 - fi - - # initial annotation - #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" - - # download results - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - mkdir -p results/ - /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ - ls - ls results/ - - # upload benchmark results - zip -r results.zip results/ - /workspace/buildkite-agent artifact upload "results.zip" - - # upload benchmarking scripts - cd "$VLLM_SOURCE_CODE_LOC/" - zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ - /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" - - cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - # upload benchmarking pipeline - /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" - - cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md - - - - # The figures should be generated by a separate process outside the CI/CD pipeline - - # # generate figures - # python3 -m pip install tabulate pandas matplotlib - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ - # --description $description \ - # --results-folder results/ - - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sharegpt - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sonnet_2048_128 - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sonnet_128_2048 - - # # upload results and figures - # /workspace/buildkite-agent artifact upload "nightly_results*.png" - # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml - # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json - # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh deleted file mode 100644 index a00de940cbbb81bff911996a7ec56d0c83c8470d..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ /dev/null @@ -1,464 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -x - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" - echo "GPU type is $gpu_type" -} - -check_hf_token() { - # check if HF_TOKEN is available and valid - if [[ -z "$HF_TOKEN" ]]; then - echo "Error: HF_TOKEN is not set." - exit 1 - elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then - echo "Error: HF_TOKEN does not start with 'hf_'." - exit 1 - else - echo "HF_TOKEN is set and valid." - fi -} - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - - -get_current_llm_serving_engine() { - - if which lmdeploy >/dev/null; then - echo "Container: lmdeploy" - export CURRENT_LLM_SERVING_ENGINE=lmdeploy - return - fi - - if [ -e /tgi-entrypoint.sh ]; then - echo "Container: tgi" - export CURRENT_LLM_SERVING_ENGINE=tgi - return - fi - - if which trtllm-build >/dev/null; then - echo "Container: tensorrt-llm" - export CURRENT_LLM_SERVING_ENGINE=trt - return - fi - - if [ -e /sgl-workspace ]; then - echo "Container: sglang" - export CURRENT_LLM_SERVING_ENGINE=sglang - return - fi - - if [ -e /vllm-workspace ]; then - echo "Container: vllm" - # move to a completely irrelevant directory, to avoid import vllm from current folder - export CURRENT_LLM_SERVING_ENGINE=vllm - - return - fi -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -kill_gpu_processes() { - pkill -f '[p]ython' - pkill -f '[p]ython3' - pkill -f '[t]ritonserver' - pkill -f '[p]t_main_thread' - pkill -f '[t]ext-generation' - pkill -f '[l]mdeploy' - # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 - pkill -f '[V]LLM' - - while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do - sleep 1 - done -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - timeout 1200 bash -c ' - until curl -s localhost:8000/v1/completions > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -ensure_installed() { - # Ensure that the given command is installed by apt-get - local cmd=$1 - if ! which "$cmd" >/dev/null; then - apt-get update && apt-get install -y "$cmd" - fi -} - -run_serving_tests() { - # run serving tests using `vllm bench serve` command - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # prepend the current serving engine to the test name - test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - reuse_server=$(echo "$common_params" | jq -r '.reuse_server') - - # get client and server arguments - server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") - client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if [[ $reuse_server == "true" ]]; then - echo "Reuse previous server for test case $test_name" - else - kill_gpu_processes - bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ - "$server_params" "$common_params" - fi - - if wait_for_server; then - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." - else - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." - break - fi - - # prepare tokenizer - # this is required for lmdeploy. - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - - - # change model name for lmdeploy (it will not follow standard hf name) - if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then - model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - backend=$CURRENT_LLM_SERVING_ENGINE - - if [[ $backend = "trt" ]]; then - backend="tensorrt-llm" - fi - - if [[ "$backend" == *"vllm"* ]]; then - backend="vllm" - fi - - if [[ "$dataset_name" = "sharegpt" ]]; then - - client_command="vllm bench serve \ - --backend $backend \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --ignore-eos \ - $client_args" - - elif [[ "$dataset_name" = "sonnet" ]]; then - - sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') - sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') - sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - - client_command="vllm bench serve \ - --backend $backend \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --sonnet-input-len $sonnet_input_len \ - --sonnet-output-len $sonnet_output_len \ - --sonnet-prefix-len $sonnet_prefix_len \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --ignore-eos \ - $client_args" - - else - - echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." - exit 1 - - fi - - - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - server_command="None" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - done - - kill_gpu_processes -} - -run_genai_perf_tests() { - # run genai-perf tests - - # $1: a json file specifying genai-perf test cases - local genai_perf_test_file - genai_perf_test_file=$1 - - # Iterate over genai-perf tests - jq -c '.[]' "$genai_perf_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # prepend the current serving engine to the test name - test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - reuse_server=$(echo "$common_params" | jq -r '.reuse_server') - - # get client and server arguments - server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if [[ $reuse_server == "true" ]]; then - echo "Reuse previous server for test case $test_name" - else - kill_gpu_processes - bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ - "$server_params" "$common_params" - fi - - if wait_for_server; then - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." - else - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." - break - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps=$num_prompts - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - backend=$CURRENT_LLM_SERVING_ENGINE - - if [[ "$backend" == *"vllm"* ]]; then - backend="vllm" - fi - #TODO: add output dir. - client_command="genai-perf profile \ - -m $model \ - --service-kind openai \ - --backend "$backend" \ - --endpoint-type chat \ - --streaming \ - --url localhost:$port \ - --request-rate $qps \ - --num-prompts $num_prompts \ - " - - echo "Client command: $client_command" - - eval "$client_command" - - #TODO: process/record outputs - done - done - - kill_gpu_processes - -} - -prepare_dataset() { - - # download sharegpt dataset - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - # duplicate sonnet by 4x, to allow benchmarking with input length 2048 - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - echo "" > sonnet_4x.txt - for _ in {1..4} - do - cat sonnet.txt >> sonnet_4x.txt - done - -} - -main() { - - # check if the environment variable is successfully injected from yaml - - check_gpus - check_hf_token - get_current_llm_serving_engine - - pip install -U transformers - - pip install -r requirements/dev.txt - which genai-perf - - # check storage - df -h - - ensure_installed wget - ensure_installed curl - ensure_installed jq - # genai-perf dependency - ensure_installed libb64-0d - - prepare_dataset - - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - - # run the test - run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" - - # run genai-perf tests - run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" - mv artifacts/ $RESULTS_FOLDER/ - - # upload benchmark results to buildkite - python3 -m pip install tabulate pandas - python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py deleted file mode 100644 index fb3b9d5e34e03c7708f560301a7b802f3e3906bc..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import datetime -import json -import os -from pathlib import Path - -import pandas as pd -from tabulate import tabulate - -results_folder = Path("results/") - -# serving results and the keys that will be printed into markdown -serving_results = [] -serving_column_mapping = { - "test_name": "Test name", - "gpu_type": "GPU", - "completed": "Successful req.", - "request_throughput": "Tput (req/s)", - "mean_ttft_ms": "Mean TTFT (ms)", - "std_ttft_ms": "Std TTFT (ms)", - "median_ttft_ms": "Median TTFT (ms)", - "mean_itl_ms": "Mean ITL (ms)", - "std_itl_ms": "Std ITL (ms)", - "median_itl_ms": "Median ITL (ms)", - "mean_tpot_ms": "Mean TPOT (ms)", - "std_tpot_ms": "Std TPOT (ms)", - "median_tpot_ms": "Median TPOT (ms)", - "total_token_throughput": "Total Token Tput (tok/s)", - "output_throughput": "Output Tput (tok/s)", - "total_input_tokens": "Total input tokens", - "total_output_tokens": "Total output tokens", - "engine": "Engine", -} - -if __name__ == "__main__": - # collect results - for test_file in results_folder.glob("*.json"): - with open(test_file) as f: - raw_result = json.loads(f.read()) - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - raw_result.update({"test_name": test_file.stem}) - - # add the result to raw_result - serving_results.append(raw_result) - continue - - serving_results = pd.DataFrame.from_dict(serving_results) - - if not serving_results.empty: - serving_results = serving_results[list(serving_column_mapping.keys())].rename( - columns=serving_column_mapping - ) - - serving_md_table_with_headers = tabulate( - serving_results, headers="keys", tablefmt="pipe", showindex=False - ) - # remove the first line of header - serving_md_table_lines = serving_md_table_with_headers.split("\n") - serving_md_table_without_header = "\n".join(serving_md_table_lines[2:]) - - prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") - - # document benchmarking results in markdown - with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: - # document results with header. - # for those who wants to reproduce our benchmark. - f.write(serving_md_table_with_headers) - f.write("\n") - - # document benchmarking results in json - with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: - results = serving_results.to_dict(orient="records") - f.write(json.dumps(results)) diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh deleted file mode 100644 index 50e1ab0242202854c8a978b9133c8de79a98a18b..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh -TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) -if [[ "$BUILDKITE_BRANCH" == "main" ]]; then - URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" -else - URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" -fi - -TIMEOUT_SECONDS=10 - -retries=0 -while [ $retries -lt 1000 ]; do - if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then - exit 0 - fi - - echo "Waiting for image to be available..." - - retries=$((retries + 1)) - sleep 5 -done - -exit 1 diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json deleted file mode 100644 index 569117aae852d0c78c64a71a610678584ed62d19..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json +++ /dev/null @@ -1,30 +0,0 @@ -[ - { - "test_name": "latency_llama8B_tp1", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "num_iters_warmup": 5, - "num_iters": 15 - } - }, - { - "test_name": "latency_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "num_iters_warmup": 5, - "num_iters": 15 - } - } -] diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json deleted file mode 100644 index 48c015aa8403b1bec6e7a014bc6317feee3e56ad..0000000000000000000000000000000000000000 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "test_name": "throughput_llama8B_tp1", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - } -] diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md similarity index 69% rename from .buildkite/nightly-benchmarks/README.md rename to .buildkite/performance-benchmarks/README.md index e6f5c8b60f459a1d25d45493807e407ffca5bab3..6d494f64f14fa18086391c35975641b612a3d700 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -2,40 +2,23 @@ ## Introduction -This directory contains two sets of benchmark for vllm. - -- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance -- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. - -See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. +This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance. +vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD. ## Performance benchmark quick overview -**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models. +**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models. **Benchmarking Duration**: about 1hr. **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. -## Nightly benchmark quick overview - -**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. - -**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. - -**Benchmarking Duration**: about 3.5hrs. - ## Trigger the benchmark -Performance benchmark will be triggered when: - -- A PR being merged into vllm. -- Every commit for those PRs with `perf-benchmarks` label AND `ready` label. - -Manually Trigger the benchmark +The benchmark needs to be triggered manually: ```bash -bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh ``` Runtime environment variables: @@ -47,14 +30,11 @@ Runtime environment variables: - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. -Nightly benchmark will be triggered when: - -- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. - ## Performance benchmark details See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. +For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. > ### Latency test @@ -152,26 +132,3 @@ Here is an example using the script to compare result_a and result_b with Model, A comparison diagram will be generated below the table. Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3 image - -## Nightly test details - -See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. - -### Workflow - -- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. -- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container. -- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`. -- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. - -### Nightly tests - -In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. - -### Docker containers - -The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. - -WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`. - -WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md similarity index 92% rename from .buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md rename to .buildkite/performance-benchmarks/performance-benchmarks-descriptions.md index 8bb16bd3cf373add01a8766a36e70fa236824d49..b9437ac5ca99a7364e567a3fcc1171e2939206a6 100644 --- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md @@ -5,7 +5,7 @@ - Input length: 32 tokens. - Output length: 128 tokens. - Batch size: fixed (8). -- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - CPU Models: llama-3.1 8B. - Evaluation metrics: end-to-end latency (mean, median, p99). @@ -16,7 +16,7 @@ - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Output length: the corresponding output length of these 200 prompts. - Batch size: dynamically determined by vllm to achieve maximum throughput. -- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - CPU Models: llama-3.1 8B. - Evaluation metrics: throughput. @@ -28,7 +28,7 @@ - Output length: the corresponding output length of these 200 prompts. - Batch size: dynamically determined by vllm and the arrival pattern of the requests. - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). -- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2 - CPU Models: llama-3.1 8B. - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py similarity index 58% rename from .buildkite/nightly-benchmarks/scripts/compare-json-results.py rename to .buildkite/performance-benchmarks/scripts/compare-json-results.py index 5ea5a50a258a434cbece3fa82163ce53f4a86e7e..c8bf7b0453662d71dff5a6be0f48d2ceb63785e3 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -7,6 +7,7 @@ from importlib import util import pandas as pd +pd.options.display.float_format = "{:.2f}".format plotly_found = util.find_spec("plotly.express") is not None @@ -109,7 +110,10 @@ def compare_data_columns( if len(compare_frames) >= 2: base = compare_frames[0] current = compare_frames[-1] - ratio = current / base + if "P99" in data_column or "Median" in data_column: + ratio = base / current # for latency + else: + ratio = current / base ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 ratio.name = f"Ratio 1 vs {len(compare_frames)}" frames.append(ratio) @@ -199,6 +203,71 @@ def split_json_by_tp_pp( return saved_paths +def _add_limit_line(fig, y_value, label): + # Visible dashed line + annotation + fig.add_hline( + y=y_value, + line_dash="dash", + line_color="red" if "ttft" in label.lower() else "blue", + annotation_text=f"{label}: {y_value} ms", + annotation_position="top left", + ) + # Optional: add a legend item (as a transparent helper trace) + if plot and plotly_found: + import plotly.graph_objects as go + + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="lines", + line=dict( + dash="dash", color="red" if "ttft" in label.lower() else "blue" + ), + name=f"{label}", + ) + ) + + +def _find_concurrency_col(df: pd.DataFrame) -> str: + for c in [ + "# of max concurrency.", + "# of max concurrency", + "Max Concurrency", + "max_concurrency", + "Concurrency", + ]: + if c in df.columns: + return c + # Fallback: guess an integer-like column (harmless if unused) + for c in df.columns: + if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: + return c + return "# of max concurrency." + + +def _highlight_threshold( + df: pd.DataFrame, threshold: float +) -> "pd.io.formats.style.Styler": + """Highlight numeric per-configuration columns with value <= threshold.""" + conc_col = _find_concurrency_col(df) + key_cols = [ + c + for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] + if c in df.columns + ] + conf_cols = [ + c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio") + ] + conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] + return df.style.map( + lambda v: "background-color:#e6ffe6;font-weight:bold;" + if pd.notna(v) and v <= threshold + else "", + subset=conf_cols, + ) + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -220,6 +289,26 @@ if __name__ == "__main__": default="# of max concurrency.", help="column name to use as X Axis in comparison graph", ) + parser.add_argument( + "-l", + "--latency", + type=str, + default="p99", + help="take median|p99 for latency like TTFT/TPOT", + ) + parser.add_argument( + "--ttft-max-ms", + type=float, + default=3000.0, + help="Reference limit for TTFT plots (ms)", + ) + parser.add_argument( + "--tpot-max-ms", + type=float, + default=100.0, + help="Reference limit for TPOT plots (ms)", + ) + args = parser.parse_args() drop_column = "P99" @@ -234,12 +323,22 @@ if __name__ == "__main__": "# of max concurrency.", "qps", ] - data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] - html_msgs_for_data_cols = [ - "Compare Output Tokens /n", - "Median TTFT /n", - "Median TPOT /n", - ] + + if "median" in args.latency: + data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + html_msgs_for_data_cols = [ + "Compare Output Tokens /n", + "Median TTFT /n", + "Median TPOT /n", + ] + drop_column = "P99" + elif "p99" in args.latency: + data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] + html_msgs_for_data_cols = [ + "Compare Output Tokens /n", + "P99 TTFT /n", + "P99 TPOT /n", + ] if len(args.file) == 1: files = split_json_by_tp_pp(args.file[0], output_root="splits") @@ -275,33 +374,83 @@ if __name__ == "__main__": f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - output_df_sorted = output_df.sort_values(by=existing_group_cols) + # output_df_sorted = output_df.sort_values(by=existing_group_cols) + output_df_sorted = output_df.sort_values(by=args.xaxis) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: - html = group.to_html() - text_file.write(html_msgs_for_data_cols[i]) - text_file.write(html) - - if plot and plotly_found: - import plotly.express as px - - df = group[raw_data_cols] - df_sorted = df.sort_values(by=info_cols[y_axis_index]) - # Melt DataFrame for plotting - df_melted = df_sorted.melt( - id_vars=info_cols[y_axis_index], - var_name="Configuration", - value_name=data_cols_to_compare[i], + group_name = ( + ",".join(map(str, name)).replace(",", "_").replace("/", "-") + ) + group_html_name = "perf_comparison_" + group_name + ".html" + + metric_name = str(data_cols_to_compare[i]).lower() + if "tok/s" in metric_name: + html = group.to_html() + elif "ttft" in metric_name: + styler = _highlight_threshold(group, args.ttft_max_ms).format( + {c: "{:.2f}" for c in group.select_dtypes("number").columns}, + na_rep="—", + ) + html = styler.to_html( + table_attributes='border="1" class="dataframe"' + ) + elif ( + "tpot" in metric_name + or "median" in metric_name + or "p99" in metric_name + ): + styler = _highlight_threshold(group, args.tpot_max_ms).format( + {c: "{:.2f}" for c in group.select_dtypes("number").columns}, + na_rep="—", ) - title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] - # Create Plotly line chart - fig = px.line( - df_melted, - x=info_cols[y_axis_index], - y=data_cols_to_compare[i], - color="Configuration", - title=title, - markers=True, + html = styler.to_html( + table_attributes='border="1" class="dataframe"' ) - # Export to HTML - text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) + + text_file.write(html_msgs_for_data_cols[i]) + text_file.write(html) + with open(group_html_name, "a+") as sub_text_file: + sub_text_file.write(html_msgs_for_data_cols[i]) + sub_text_file.write(html) + + if plot and plotly_found: + import plotly.express as px + + df = group[raw_data_cols] + df_sorted = df.sort_values(by=info_cols[y_axis_index]) + # Melt DataFrame for plotting + df_melted = df_sorted.melt( + id_vars=info_cols[y_axis_index], + var_name="Configuration", + value_name=data_cols_to_compare[i], + ) + title = ( + data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + ) + # Create Plotly line chart + fig = px.line( + df_melted, + x=info_cols[y_axis_index], + y=data_cols_to_compare[i], + color="Configuration", + title=title, + markers=True, + ) + + # ---- Add threshold lines based on metric name ---- + if "ttft" in metric_name: + _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") + elif ( + "tpot" in metric_name + or "median" in metric_name + or "p99" in metric_name + ): + _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") + + # Export to HTML + text_file.write( + fig.to_html(full_html=True, include_plotlyjs="cdn") + ) + sub_text_file.write( + fig.to_html(full_html=True, include_plotlyjs="cdn") + ) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py similarity index 98% rename from .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py rename to .buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py index 77047636bb9513a96895c1620700e3f6e7482ee8..80bb4d846a22616c0eef7c832b22ef251aa41da2 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py @@ -63,9 +63,11 @@ serving_column_mapping = { "mean_ttft_ms": "Mean TTFT (ms)", "median_ttft_ms": "Median TTFT (ms)", "p99_ttft_ms": "P99 TTFT (ms)", + "std_ttft_ms": "STD TTFT (ms)", "mean_tpot_ms": "Mean TPOT (ms)", "median_tpot_ms": "Median", "p99_tpot_ms": "P99", + "std_tpot_ms": "STD TPOT (ms)", "mean_itl_ms": "Mean ITL (ms)", "median_itl_ms": "Median ITL (ms)", "p99_itl_ms": "P99 ITL (ms)", @@ -368,7 +370,7 @@ if __name__ == "__main__": # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", # we want to turn it into "8xGPUTYPE" df["GPU"] = df["GPU"].apply( - lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}" + lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0]) ) # get markdown tables @@ -390,7 +392,7 @@ if __name__ == "__main__": json_file = "benchmark_results.json" with open(results_folder / md_file, "w") as f: results = read_markdown( - "../.buildkite/nightly-benchmarks/" + "../.buildkite/performance-benchmarks/" + "performance-benchmarks-descriptions.md" ) results = results.format( diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/performance-benchmarks/scripts/launch-server.sh similarity index 97% rename from .buildkite/nightly-benchmarks/scripts/launch-server.sh rename to .buildkite/performance-benchmarks/scripts/launch-server.sh index fb5063db86942a15f3ca5cd3d645b7ae03acd468..ebacdcbd6821ba1e369048af7b9d16e49c397b50 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh +++ b/.buildkite/performance-benchmarks/scripts/launch-server.sh @@ -181,18 +181,14 @@ launch_vllm_server() { if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') - server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ + server_command="vllm serve $model \ -tp $tp \ - --model $model \ --port $port \ $server_args" else echo "Key 'fp8' does not exist in common params." - server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ + server_command="vllm serve $model \ -tp $tp \ - --model $model \ --port $port \ $server_args" fi diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh similarity index 95% rename from .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh rename to .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh index b1b7d2d77a44d962fec74cc58680a6c1ed1ade59..99a5a5e334f8ec0f7502bb942671af19863936a9 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -15,6 +15,8 @@ check_gpus() { declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) elif command -v amd-smi; then declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) + elif command -v hl-smi; then + declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l) fi if [[ $gpu_count -gt 0 ]]; then @@ -23,10 +25,16 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi + + declare -g arch_suffix='' + if command -v nvidia-smi; then declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') elif command -v amd-smi; then declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') + elif command -v hl-smi; then + declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//') + arch_suffix='-hpu' fi echo "GPU type is $gpu_type" } @@ -138,6 +146,10 @@ kill_gpu_processes() { while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do sleep 1 done + elif command -v hl-smi; then + while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do + sleep 1 + done fi # remove vllm config file @@ -365,8 +377,7 @@ run_serving_tests() { continue fi - server_command="$server_envs python3 \ - -m vllm.entrypoints.openai.api_server \ + server_command="$server_envs vllm serve \ $server_args" # run the server @@ -452,14 +463,10 @@ main() { ARCH='-cpu' else check_gpus + ARCH="$arch_suffix" fi check_hf_token - # Set to v1 to run v1 benchmark - if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then - export VLLM_USE_V1=1 - fi - # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) @@ -475,7 +482,12 @@ main() { ensure_sharegpt_downloaded declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER - QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/ + + # dump vllm info via vllm collect-env + env_output=$(vllm collect-env) + + echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" # benchmarking run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/genai-perf-tests.json rename to .buildkite/performance-benchmarks/tests/genai-perf-tests.json diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json new file mode 100644 index 0000000000000000000000000000000000000000..77d1694ec8641a574bea94c5bc8b092262e7ae3d --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json @@ -0,0 +1,26 @@ +[ + { + "test_name": "latency_llama8B_tp2", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json new file mode 100644 index 0000000000000000000000000000000000000000..296380f72a668b8ce41dc55379d2841d2fd70744 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json @@ -0,0 +1,55 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + }, + { + "test_name": "latency_llama70B_tp4", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/performance-benchmarks/tests/latency-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/latency-tests.json rename to .buildkite/performance-benchmarks/tests/latency-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/performance-benchmarks/tests/nightly-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/nightly-tests.json rename to .buildkite/performance-benchmarks/tests/nightly-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json similarity index 80% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json index ce396d6e54f278cd8605a844469828301dff212f..0b1a42e79025555e3e5925612202d6c40feb1c93 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json @@ -95,6 +95,38 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_bf16_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -233,6 +265,41 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_bf16_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", "qps_list": ["inf"], @@ -365,6 +432,38 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_int8_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -503,6 +602,41 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_int8_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", "qps_list": ["inf"], @@ -638,6 +772,39 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_int4_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -780,6 +947,42 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", "qps_list": ["inf"], diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json similarity index 52% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu.json index e21c8df0a9fe955e0f36392bc3a65bcd83de620b..f792956f39472e0e6f232aaff3756d7b0c6837a2 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -2,7 +2,7 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -28,13 +28,13 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 32 } }, { "test_name": "serving_llama8B_tp2_sharegpt", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -60,13 +60,13 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 32 } }, { - "test_name": "serving_llama8B_tp4_sharegpt", + "test_name": "serving_llama8B_tp1_random_128_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -76,11 +76,12 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, + "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", + "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, @@ -90,15 +91,122 @@ "client_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048, + "ignore-eos": "", + "num_prompts": 32 } }, { - "test_name": "serving_llama8B_tp4_random_1024_128", + "test_name": "serving_llama8B_tp1_random_2048_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -108,7 +216,7 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, + "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, @@ -124,16 +232,16 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", - "random-input-len": 1024, + "random-input-len": 2048, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 32 } }, { - "test_name": "serving_llama8B_pp6_random_1024_128", + "test_name": "serving_llama8B_tp2_random_2048_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -143,7 +251,7 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 6, + "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, @@ -159,10 +267,10 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", - "random-input-len": 1024, + "random-input-len": 2048, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 32 } } ] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json new file mode 100644 index 0000000000000000000000000000000000000000..8c6b34bd9fa33367a020888fddfe0fc3a5ad2108 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json @@ -0,0 +1,82 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests.json rename to .buildkite/performance-benchmarks/tests/serving-tests.json diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json new file mode 100644 index 0000000000000000000000000000000000000000..dc214ddfb27e3a4d66456cb856f1ecbdac15babd --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json @@ -0,0 +1,27 @@ +[ + { + "test_name": "throughput_llama8B_tp2", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json new file mode 100644 index 0000000000000000000000000000000000000000..3127bf2f6bce376906f419b46424134c86bd97ff --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json @@ -0,0 +1,61 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/performance-benchmarks/tests/throughput-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/throughput-tests.json rename to .buildkite/performance-benchmarks/tests/throughput-tests.json diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml deleted file mode 100644 index d5cad1c73c6f895d446dd2b594915d7af8ac9f7a..0000000000000000000000000000000000000000 --- a/.buildkite/pyproject.toml +++ /dev/null @@ -1,46 +0,0 @@ -# This local pyproject file is part of the migration from yapf to ruff format. -# It uses the same core rules as the main pyproject.toml file, but with the -# following differences: -# - ruff line length is overridden to 88 -# - deprecated typing ignores (UP006, UP035) have been removed - -[tool.ruff] -line-length = 88 - -[tool.ruff.lint.per-file-ignores] -"vllm/third_party/**" = ["ALL"] -"vllm/version.py" = ["F401"] -"vllm/_version.py" = ["ALL"] - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - "I", - # flake8-logging-format - "G", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # Loop control variable not used within loop body - "B007", - # f-string format - "UP032", - # Can remove once 3.10+ is the minimum Python version - "UP007", -] - -[tool.ruff.format] -docstring-code-format = true diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 7677d783fabc26ae34fae7b579df3cde123d1cc2..38c400ba1faf5037e7ed0115fc489c1333218f66 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,5 +1,5 @@ steps: - # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 + # aarch64 + CUDA builds - label: "Build arm64 wheel - CUDA 12.9" depends_on: ~ id: build-wheel-arm64-cuda-12-9 @@ -8,13 +8,28 @@ steps: commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" + # aarch64 build + - label: "Build arm64 CPU wheel" + depends_on: ~ + id: build-wheel-arm64-cpu + agents: + queue: arm64_cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + + # x86 + CUDA builds - label: "Build wheel - CUDA 12.8" depends_on: ~ id: build-wheel-cuda-12-8 @@ -28,33 +43,33 @@ steps: env: DOCKER_BUILDKIT: "1" - - label: "Build wheel - CUDA 12.6" + - label: "Build wheel - CUDA 12.9" depends_on: ~ - id: build-wheel-cuda-12-6 + id: build-wheel-cuda-12-9 agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - # x86 + CUDA builds - - label: "Build wheel - CUDA 12.9" + - label: "Build wheel - CUDA 13.0" depends_on: ~ - id: build-wheel-cuda-12-9 + id: build-wheel-cuda-13-0 agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" + # Build release images (12.9) - label: "Build release image (x86)" depends_on: ~ id: build-release-image-x86 @@ -62,13 +77,12 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" # re-tag to default image tag and push, just in case arm64 build fails - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 - label: "Build release image (arm64)" depends_on: ~ id: build-release-image-arm64 @@ -76,7 +90,7 @@ steps: queue: arm64_cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" # Add job to create multi-arch manifest @@ -102,24 +116,6 @@ steps: commands: - "bash .buildkite/scripts/annotate-release.sh" - - label: "Build and publish TPU release image" - depends_on: ~ - if: build.env("NIGHTLY") == "1" - agents: - queue: tpu_queue_postmerge - commands: - - "yes | docker system prune -a" - - "git fetch --all" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." - - "docker push vllm/vllm-tpu:nightly" - - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" - plugins: - - docker-login#v3.0.0: - username: vllmbot - password-env: DOCKERHUB_TOKEN - env: - DOCKER_BUILDKIT: "1" - - input: "Provide Release version here" id: input-release-version fields: @@ -136,12 +132,28 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" + - block: "Build arm64 CPU release image" + key: block-arm64-cpu-release-image-build + depends_on: ~ + + - label: "Build and publish arm64 CPU release image" + depends_on: block-arm64-cpu-release-image-build + agents: + queue: arm64_cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest" + - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)" + env: + DOCKER_BUILDKIT: "1" + - label: "Build and publish nightly multi-arch image to DockerHub" depends_on: - create-multi-arch-manifest @@ -150,11 +162,16 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly" - - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" - - "docker push vllm/vllm-openai:nightly" - - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" + - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64" + - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64" + - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64" + - "docker push vllm/vllm-openai:nightly-x86_64" + - "docker push vllm/vllm-openai:nightly-aarch64" + - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" + - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" + - "docker manifest push vllm/vllm-openai:nightly" + - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" # Clean up old nightly builds (keep only last 14) - "bash .buildkite/scripts/cleanup-nightly-builds.sh" plugins: @@ -163,3 +180,4 @@ steps: password-env: DOCKERHUB_TOKEN env: DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: "vllmbot" diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index fde48603ad3cd1dc01b455e291f23badb02037e2..56bb5cedaa0a96259f2b4159f1abeaca8061cf0e 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -2,16 +2,23 @@ set -ex -# Get release version and strip leading 'v' if present -RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//') - -if [ -z "$RELEASE_VERSION" ]; then - echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid." - exit 1 +# Get release version, default to 1.0.0.dev for nightly/per-commit builds +RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//') +if [ -z "${RELEASE_VERSION}" ]; then + RELEASE_VERSION="1.0.0.dev" fi buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF -To download the wheel: +To download the wheel (by commit): +\`\`\` +aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . + +aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . +\`\`\` + +To download the wheel (by version): \`\`\` aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh index 1a82f7d085233bdc8262a34201cb610b3ca393b4..f02a128c6772665aa7996f9f4442ebc4f5219e0d 100755 --- a/.buildkite/scripts/cleanup-nightly-builds.sh +++ b/.buildkite/scripts/cleanup-nightly-builds.sh @@ -8,20 +8,41 @@ set -ex # DockerHub API endpoint for vllm/vllm-openai repository REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" -# Get DockerHub token from environment +# Get DockerHub credentials from environment if [ -z "$DOCKERHUB_TOKEN" ]; then echo "Error: DOCKERHUB_TOKEN environment variable is not set" exit 1 fi +if [ -z "$DOCKERHUB_USERNAME" ]; then + echo "Error: DOCKERHUB_USERNAME environment variable is not set" + exit 1 +fi + +# Get DockerHub bearer token +echo "Getting DockerHub bearer token..." +set +x +BEARER_TOKEN=$(curl -s -X POST \ + -H "Content-Type: application/json" \ + -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \ + "https://hub.docker.com/v2/users/login" | jq -r '.token') +set -x + +if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then + echo "Error: Failed to get DockerHub bearer token" + exit 1 +fi + # Function to get all tags from DockerHub get_all_tags() { local page=1 local all_tags="" while true; do - local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \ + set +x + local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \ "$REPO_API_URL?page=$page&page_size=100") + set -x # Get both last_updated timestamp and tag name, separated by | local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') @@ -43,7 +64,9 @@ delete_tag() { echo "Deleting tag: $tag_name" local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" - local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url") + set +x + local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") + set -x if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index aa4cc7b35a5434013be24a5baecb748e283347a2..864eb470bb0a7e33cd9067ec912a30a3e2722ee3 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -59,7 +59,7 @@ while true; do fi done -echo "--- Pulling container" +echo "--- Pulling container" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" @@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface" commands=$@ echo "Commands:$commands" -if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then - commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"} -fi +commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"} if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} fi -if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then - commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} -fi +commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"} if [[ $commands == *"pytest -v -s lora"* ]]; then commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} @@ -173,19 +169,28 @@ fi PARALLEL_JOB_COUNT=8 MYPYTHONPATH=".." -# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. +# Test that we're launching on the machine that has +# proper access to GPUs +render_gid=$(getent group render | cut -d: -f3) +if [[ -z "$render_gid" ]]; then + echo "Error: 'render' group not found. This is required for GPU access." >&2 + exit 1 +fi + +# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then - # assign job count as the number of shards used - commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + # assign job count as the number of shards used + commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g') for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do # assign shard-id for each shard - commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} + commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g') echo "Shard ${GPU} commands:$commands_gpu" echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --network=host \ --shm-size=16gb \ + --group-add "$render_gid" \ --rm \ -e HIP_VISIBLE_DEVICES="${GPU}" \ -e HF_TOKEN \ @@ -217,8 +222,8 @@ else --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --network=host \ --shm-size=16gb \ + --group-add "$render_gid" \ --rm \ - -e HIP_VISIBLE_DEVICES=0 \ -e HF_TOKEN \ -e AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY \ diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 36bcb015d308ebf096662060c4e9726db10e0927..39ea1801730819b713c1d1e868f61bf7af241797 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -25,25 +25,28 @@ function cpu_tests() { # offline inference podman exec -it "$container_id" bash -c " - set -e - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + set -xve + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log # Run basic model test podman exec -it "$container_id" bash -c " - set -e + set -evx pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib pip install sentence-transformers datamodel_code_generator - pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model + + # Note: disable Bart until supports V1 + # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] - pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" + # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. + # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log } # All of CPU tests are expected to be finished less than 40 mins. export container_id export -f cpu_tests -timeout 40m bash -c cpu_tests +timeout 120m bash -c cpu_tests diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 7512cb1bbed016ea37d5645fdbcb3f20c9cbd540..7479c43977d780af179d0a0fd21ebc7041425988 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -49,6 +49,7 @@ function cpu_tests() { # Run kernel tests docker exec cpu-test-"$NUMA_NODE" bash -c " set -e + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py pytest -x -v -s tests/kernels/test_onednn.py" # Run basic model test @@ -70,13 +71,13 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -x -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" # Note: disable it until supports V1 # Run AWQ test # docker exec cpu-test-"$NUMA_NODE" bash -c " # set -e - # VLLM_USE_V1=0 pytest -x -s -v \ + # pytest -x -s -v \ # tests/quantization/test_ipex_quant.py" # Run multi-lora tests @@ -116,4 +117,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 2.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh new file mode 100644 index 0000000000000000000000000000000000000000..29c8f5ed5a91aeefb14a74fb722d071741dda971 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# This script build the Ascend NPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Base ubuntu image with basic ascend development libraries and python installed +VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git" +CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg" +TEST_RUN_CONFIG_FILE="vllm_test.cfg" +VLLM_ASCEND_TMP_DIR= +# Get the test run configuration file from the vllm-ascend repository +fetch_vllm_test_cfg() { + VLLM_ASCEND_TMP_DIR=$(mktemp -d) + # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval + cleanup() { + rm -rf "${VLLM_ASCEND_TMP_DIR}" + } + trap cleanup EXIT + + GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}" + if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then + echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2 + exit 1 + fi + + # If the file already exists locally, just overwrite it + cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}" + echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}" + + # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources + # when the trap is abnormal has been completed, so the temporary resources are manually deleted here. + rm -rf "${VLLM_ASCEND_TMP_DIR}" + trap - EXIT +} + +# Downloads test run configuration file from a remote URL. +# Loads the configuration into the current script environment. +get_config() { + if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then + echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 + exit 1 + fi + source "${TEST_RUN_CONFIG_FILE}" + echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" + return 0 +} + +# get test running configuration. +fetch_vllm_test_cfg +get_config +# Check if the function call was successful. If not, exit the script. +if [ $? -ne 0 ]; then + exit 1 +fi + +image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}" +container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + +# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards +agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}') +echo "agent_idx: ${agent_idx}" +builder_name="cachebuilder${agent_idx}" +builder_cache_dir="/mnt/docker-cache${agent_idx}" +mkdir -p ${builder_cache_dir} + +# Try building the docker image +cat <=6.0 modelscope + +WORKDIR /workspace/vllm + +# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid. +COPY requirements/common.txt /workspace/vllm/requirements/common.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -r requirements/common.txt + +COPY . . + +# Install vLLM +RUN --mount=type=cache,target=/root/.cache/pip \ + VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton + +# Install vllm-ascend +WORKDIR /workspace +ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git +ARG VLLM_ASCEND_TAG=main +RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \ + git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend + +# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid. +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -r /workspace/vllm-ascend/requirements.txt + +RUN --mount=type=cache,target=/root/.cache/pip \ + export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ + +ENV VLLM_WORKER_MULTIPROC_METHOD=spawn +ENV VLLM_USE_MODELSCOPE=True + +WORKDIR /workspace/vllm-ascend + +CMD ["/bin/bash"] + +EOF + +# Setup cleanup +remove_docker_container() { + docker rm -f "${container_name}" || true; + docker image rm -f "${image_name}" || true; + docker system prune -f || true; +} +trap remove_docker_container EXIT + +# Generate corresponding --device args based on BUILDKITE_AGENT_NAME +# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. +# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards. +# returns --device /dev/davinci0 --device /dev/davinci1 +parse_and_gen_devices() { + local input="$1" + local index cards_num + if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then + index="${BASH_REMATCH[1]}" + cards_num="${BASH_REMATCH[2]}" + else + echo "parse error" >&2 + return 1 + fi + + local devices="" + local i=0 + while (( i < cards_num )); do + local dev_idx=$(((index - 1)*cards_num + i )) + devices="$devices --device /dev/davinci${dev_idx}" + ((i++)) + done + + # trim leading space + devices="${devices#"${devices%%[![:space:]]*}"}" + # Output devices: assigned to the caller variable + printf '%s' "$devices" +} + +devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 + +# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. +# This test checks whether the OOT platform interface is functioning properly in conjunction with +# the hardware plugin vllm-ascend. +model_cache_dir=/mnt/modelscope${agent_idx} +mkdir -p ${model_cache_dir} +docker run \ + ${devices} \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v ${model_cache_dir}:/root/.cache/modelscope \ + --entrypoint="" \ + --name "${container_name}" \ + "${image_name}" \ + bash -c ' + set -e + pytest -v -s tests/e2e/vllm_interface/ +' diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index e76528a1782058d10d8ee9fb62ac8a7b19b96f04..cbb2527a4ff0aa30664f340c61f75833db469e2a 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" -export VLLM_USE_V1=1 + export VLLM_XLA_CHECK_RECOMPILATION=1 export VLLM_XLA_CACHE_PATH= -echo "Using VLLM V1" echo "--- Hardware Information ---" # tpu-info diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 69366cd503219864f59a4cacdd3d33417c1cec22..f022fa3672eeba2774d51e34b928ea25c7d4d6e6 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 echo "--- Python dependencies installed ---" -export VLLM_USE_V1=1 + export VLLM_XLA_CHECK_RECOMPILATION=1 export VLLM_XLA_CACHE_PATH= -echo "Using VLLM V1" echo "--- Hardware Information ---" # tpu-info diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 1fc3dbd8c21f4c46e5e85301d51bd39140a1e72c..d49f3e2f47cf1e3e26df72e1c6bb0bff00b9185b 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -20,7 +20,10 @@ trap remove_docker_container EXIT # Run the image and test offline inference/tensor parallel docker run \ - --device /dev/dri \ + --device /dev/dri:/dev/dri \ + --net=host \ + --ipc=host \ + --privileged \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ -e "HF_TOKEN=${HF_TOKEN}" \ @@ -42,9 +45,7 @@ docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_metrics_reader.py ' diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh index 72812218cb668368bfcf5b2811acf4bfb74102a2..51536b36b808de2a2bf6c2977d3990a2d90a115c 100644 --- a/.buildkite/scripts/run-benchmarks.sh +++ b/.buildkite/scripts/run-benchmarks.sh @@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_ bench_throughput_exit_code=$? # run server-based benchmarks and upload the result to buildkite -python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & +vllm serve meta-llama/Llama-2-7b-chat-hf & server_pid=$! wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh new file mode 100644 index 0000000000000000000000000000000000000000..5302f524a0ae445fc3ae957a072c72453e50ddf6 --- /dev/null +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +set -euxo pipefail + +# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] +THRESHOLD=${1:-0.25} +NUM_Q=${2:-1319} +PORT=${3:-8010} +OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} +mkdir -p "${OUT_DIR}" + +wait_for_server() { + local port=$1 + timeout 600 bash -c ' + until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do + sleep 1 + done' +} + +MODEL="deepseek-ai/DeepSeek-V2-lite" +BACKENDS=("deepep_high_throughput" "deepep_low_latency") + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then + kill "${SERVER_PID}" 2>/dev/null || true + for _ in {1..20}; do + kill -0 "${SERVER_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -9 "${SERVER_PID}" 2>/dev/null || true + fi +} +trap cleanup EXIT + +for BACK in "${BACKENDS[@]}"; do + VLLM_DEEP_GEMM_WARMUP=skip \ + VLLM_ALL2ALL_BACKEND=$BACK \ + vllm serve "$MODEL" \ + --enforce-eager \ + --tensor-parallel-size 2 \ + --data-parallel-size 2 \ + --enable-expert-parallel \ + --enable-eplb \ + --trust-remote-code \ + --max-model-len 2048 \ + --port $PORT & + SERVER_PID=$! + wait_for_server $PORT + + TAG=$(echo "$MODEL" | tr '/: \\n' '_____') + OUT="${OUT_DIR}/${TAG}_${BACK}.json" + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" +PY + + cleanup + SERVER_PID= + sleep 1 + PORT=$((PORT+1)) +done diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh new file mode 100644 index 0000000000000000000000000000000000000000..a5135299297e25d69e6baf1aaad3d196c533f76a --- /dev/null +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +set -euxo pipefail + +# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] +THRESHOLD=${1:-0.8} +NUM_Q=${2:-1319} +PORT=${3:-8020} +OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} +mkdir -p "${OUT_DIR}" + +wait_for_server() { + local port=$1 + timeout 600 bash -c ' + until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do + sleep 1 + done' +} + +MODEL="QWen/Qwen3-30B-A3B-FP8" +BACKENDS=("deepep_high_throughput" "deepep_low_latency") + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then + kill "${SERVER_PID}" 2>/dev/null || true + for _ in {1..20}; do + kill -0 "${SERVER_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -9 "${SERVER_PID}" 2>/dev/null || true + fi +} +trap cleanup EXIT + +for BACK in "${BACKENDS[@]}"; do + VLLM_DEEP_GEMM_WARMUP=skip \ + VLLM_ALL2ALL_BACKEND=$BACK \ + vllm serve "$MODEL" \ + --enforce-eager \ + --tensor-parallel-size 2 \ + --data-parallel-size 2 \ + --enable-expert-parallel \ + --trust-remote-code \ + --max-model-len 2048 \ + --port $PORT & + SERVER_PID=$! + wait_for_server $PORT + + TAG=$(echo "$MODEL" | tr '/: \\n' '_____') + OUT="${OUT_DIR}/${TAG}_${BACK}.json" + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" +PY + + cleanup + SERVER_PID= + sleep 1 + PORT=$((PORT+1)) +done diff --git a/.buildkite/scripts/tpu/quantized_v6e_1.env b/.buildkite/scripts/tpu/quantized_v6e_1.env index bd25c803081a677b04faa7f45acf8629e97eb877..ecb98d4516bd533e02325fb0e68d8a14da7bf48b 100644 --- a/.buildkite/scripts/tpu/quantized_v6e_1.env +++ b/.buildkite/scripts/tpu/quantized_v6e_1.env @@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024 TENSOR_PARALLEL_SIZE=1 MAX_MODEL_LEN=2048 DOWNLOAD_DIR=/mnt/disks/persist -EXPECTED_THROUGHPUT=10.0 +EXPECTED_THROUGHPUT=8.7 INPUT_LEN=1800 OUTPUT_LEN=128 diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh index b1e17b438578d4a071f7b870b6c24ae6dc5d160e..3364fce8e1fdc2ffdf1c215266b6d496356c40b1 100755 --- a/.buildkite/scripts/tpu/run_bm.sh +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -42,7 +42,7 @@ echo "lanching vllm..." echo "logging to $VLLM_LOG" echo -VLLM_USE_V1=1 vllm serve $MODEL \ +vllm serve $MODEL \ --seed 42 \ --max-num-seqs $MAX_NUM_SEQS \ --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 43aa8c47be2992975a9264d2da7f0e6d06698d4b..945c5e48c00902c1947dc50b586acf0ae02881b0 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel" aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -if [[ $normal_wheel == *"cu126"* ]]; then - # if $normal_wheel matches cu126, do not upload the index.html - echo "Skipping index files for cu126 wheels" -elif [[ $normal_wheel == *"cu128"* ]]; then - # if $normal_wheel matches cu128, do not upload the index.html - echo "Skipping index files for cu128 wheels" -else +if [[ $normal_wheel == *"cu129"* ]]; then # only upload index.html for cu129 wheels (default wheels) as it # is available on both x86 and arm64 aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" +else + echo "Skipping index files for non-cu129 wheels" fi # generate index for nightly aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" -if [[ $normal_wheel == *"cu126"* ]]; then - # if $normal_wheel matches cu126, do not upload the index.html - echo "Skipping index files for cu126 wheels" -elif [[ $normal_wheel == *"cu128"* ]]; then - # if $normal_wheel matches cu128, do not upload the index.html - echo "Skipping index files for cu128 wheels" -else +if [[ $normal_wheel == *"cu129"* ]]; then # only upload index.html for cu129 wheels (default wheels) as it # is available on both x86 and arm64 aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" +else + echo "Skipping index files for non-cu129 wheels" fi aws s3 cp "$wheel" "s3://vllm-wheels/$version/" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2471b509a9fff761d242080d94f37b9639c46f89 --- /dev/null +++ b/.buildkite/test-amd.yaml @@ -0,0 +1,1476 @@ +# In this file, you can add more tests to run either by adding a new step or +# adding a new command to an existing step. See different options here for examples. + +# This script will be feed into Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file. + +# Documentation +# label(str): the name of the test. emojis allowed. +# fast_check(bool): whether to run this on each commit on the fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline. +# fast_check_only(bool): run this test on the fastcheck pipeline only +# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. +# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests). +# command(str): the single command to run for tests. incompatible with commands. +# commands(list): the list of commands to run for the test. incompatible with command. +# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental] +# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200 +# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4. +# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host, +# in this case, commands must be specified. the first command runs on the first host, the second +# command runs on the second host. +# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout. +# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB +# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. +# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests +# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. + +# When adding a test +# - If the test belongs to an existing group, add it there +# - If the test is short, add to any existing step +# - If the test takes more than 10min, then it is okay to create a new step. +# Note that all steps execute in parallel. + +steps: +##### fast check tests ##### + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies. Please check the error message and add the package to whitelist + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh + +- label: Async Engine, Inputs, Utils, Worker Test # 10min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/transformers_utils + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + +- label: Python-only Installation Test # 10min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Basic Correctness Test # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + +- label: Entrypoints Unit Tests # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + fast_check: true + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration Test (LLM) # 30min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration Test (API Server) # 100min + timeout_in_minutes: 130 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + +- label: Entrypoints Integration Test (Pooling) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + +- label: Distributed Tests (4 GPUs) # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/test_basic_correctness + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - pushd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd + +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + #- export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: EPLB Algorithm Test # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution Test # 10min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py + +- label: Metrics, Tracing Test # 12min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +##### fast check tests ##### +##### 1 GPU test ##### + +- label: Regression Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Engine Test # 25min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization + +- label: V1 Test e2e + engine # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine + +- label: V1 Test entrypoints # 35min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: V1 Test others # 42min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # split the test to avoid interference + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + # Integration test for streaming correctness (requires special branch). + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +# TODO: Add the "V1 Test attetion (MI300)" test group + +- label: V1 Test attention (H100) # 10min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: V1 Test others (CPU) # 5 mins + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + +- label: Examples Test # 30min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_pooling.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + +- label: Platform Tests (CUDA) # 4min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Samplers Test # 56min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + +- label: LoRA Test %N # 20min each + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora \ + --shard-id=$$BUILDKITE_PARALLEL_JOB \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --ignore=lora/test_chatglm3_tp.py \ + --ignore=lora/test_llama_tp.py \ + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss_tp.py \ + --ignore=lora/test_qwen3moe_tp.py + parallelism: 4 + +- label: PyTorch Compilation Unit Tests # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_pass_manager.py + - pytest -v -s compile/test_fusion.py + - pytest -v -s compile/test_fusion_attn.py + - pytest -v -s compile/test_functionalization.py + - pytest -v -s compile/test_silu_mul_quant_fusion.py + # - pytest -v -s compile/test_sequence_parallelism.py + # - pytest -v -s compile/test_async_tp.py + - pytest -v -s compile/test_fusion_all_reduce.py + - pytest -v -s compile/test_decorator.py + - pytest -v -s compile/test_noop_elimination.py + - pytest -v -s compile/test_aot_compile.py + +- label: PyTorch Fullgraph Smoke Test # 15min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/test_multimodal_compile.py + - pytest -v -s compile/piecewise/ + +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + +- label: Cudagraph test + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py + +- label: Kernels Core Operation Test # 48min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N # 23min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N # 64min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test # 31min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Model Executor Test # 23min + timeout_in_minutes: 35 + torch_nightly: true + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py + +- label: Benchmarks # 11min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ + +- label: Quantization Test # 70min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: LM Eval Small Models # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: OpenAI API correctness # 10min + timeout_in_minutes: 15 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval + # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 + - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py + +- label: OpenAI-Compatible Tool Use # 23 min + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) # 5 mins + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use + +##### models test ##### + +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py \ + -k 'not test_can_initialize_small_subset' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_8 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation \ + -m hybrid_model \ + --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ + --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test + +- label: Multi-Modal Processor Test # 44min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing + +- label: Multi-Modal Models Test (Standard) # 60min + timeout_in_minutes: 80 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Accuracy Eval (Small Models) # 10min + timeout_in_minutes: 70 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + +- label: Multi-Modal Models Test (Extended) 1 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models Test (Extended) 2 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models Test (Extended) 3 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +- label: Quantized Models Test # 45 min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models Test + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* + +- label: Transformers Nightly Models Test + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/" + optional: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper + +- label: Blackwell Test # 21 min + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + +- label: Blackwell Fusion Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + +- label: Blackwell Fusion E2E Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusions_e2e.py + - tests/compile/test_full_graph.py + commands: + - nvidia-smi + # Run all e2e fusion tests + - pytest -v -s tests/compile/test_fusions_e2e.py + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + +- label: ROCm GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + agent_pool: mi325_1 + mirror_hardwares: [amdproduction] + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Blackwell Quantized MoE Test + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + +##### 1 GPU test ##### +##### multi gpus test ##### + +- label: Distributed Comm Ops Test # 7min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: 2 Node Tests (4 GPUs in total) # 16min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' + - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + +- label: Distributed Tests (2 GPUs) # 68min + timeout_in_minutes: 90 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Model Tests (2 GPUs) # 37min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' + +- label: Plugin Tests (2 GPUs) # 40min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins + +- label: Pipeline + Context Parallelism Test # 45min + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py + +- label: LoRA TP Test (Distributed) # 17 min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py + + +- label: Weight Loading Multiple GPU Test # 33min + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU Test - Large Models # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + +- label: NixlConnector PD accuracy tests (Distributed) # 30min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + +##### multi gpus test ##### +##### A100 test ##### + +- label: Distributed Tests (A100) # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + +##### H200 test ##### +- label: Distributed Tests (H200) # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/compile/test_async_tp.py + - pytest -v -s tests/compile/test_sequence_parallelism.py + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + +##### B200 test ##### +- label: Distributed Tests (B200) # optional + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + +##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: DeepSeek V2-Lite Accuracy + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0914c899aa5b8878af5ba2be2ced9724f98620c0..4ac76aba67b9cde147b55907c702c01cab586954 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -25,6 +25,7 @@ # and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables. # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run. +# autorun_on_main (bool): default to false, if true, the test will run automatically when commit is pushed to main branch. # When adding a test # - If the test belongs to an existing group, add it there @@ -38,7 +39,7 @@ steps: - label: Pytorch Nightly Dependency Override Check # 2min # if this test fails, it means the nightly torch version is not compatible with some # of the dependencies. Please check the error message and add the package to whitelist - # in /vllm/tools/generate_nightly_torch_test.py + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py soft_fail: true source_file_dependencies: - requirements/nightly_torch_test.txt @@ -50,19 +51,30 @@ steps: mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - - tests/utils_ - tests/standalone_tests/lazy_imports.py - tests/transformers_utils + - tests/config + no_gpu: true commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - - pytest -v -s multimodal - - pytest -v -s utils_ # Utils - - pytest -v -s transformers_utils # transformers_utils + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s transformers_utils + - pytest -v -s config - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -159,13 +171,12 @@ steps: - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py - - tests/v1/test_async_llm_dp.py - - tests/v1/test_external_lb_dp.py - - tests/v1/test_internal_lb_dp.py - - tests/v1/test_hybrid_lb_dp.py + - tests/v1/distributed - tests/v1/engine/test_engine_core_client.py - tests/distributed/test_symm_mem_allreduce.py commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 # test with torchrun tp=2 and external_dp=2 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with torchrun tp=2 and pp=2 @@ -180,10 +191,10 @@ steps: - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py - - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py @@ -197,6 +208,24 @@ steps: - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - popd +- label: Distributed Tests (8 GPUs) # 4min + timeout_in_minutes: 10 + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - label: EPLB Algorithm Test # 5min timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" @@ -206,8 +235,8 @@ steps: commands: - pytest -v -s distributed/test_eplb_algo.py -- label: EPLB Execution Test # 5min - timeout_in_minutes: 15 +- label: EPLB Execution Test # 10min + timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -215,6 +244,7 @@ steps: - tests/distributed/test_eplb_execute.py commands: - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py - label: Metrics, Tracing Test # 12min timeout_in_minutes: 20 @@ -289,27 +319,56 @@ steps: - vllm/ - tests/v1 commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt # split the test to avoid interference - - pytest -v -s v1/core + - pytest -v -s -m 'not cpu_test' v1/core - pytest -v -s v1/executor - pytest -v -s v1/kv_offload - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - - pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode - - pytest -v -s v1/kv_connector/unit - - pytest -v -s v1/metrics - - pytest -v -s v1/test_kv_sharing.py - - pytest -v -s v1/test_metrics_reader.py + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_request.py - - pytest -v -s v1/test_serial_utils.py - - pytest -v -s v1/test_utils.py + - pytest -v -s v1/test_outputs.py # Integration test for streaming correctness (requires special branch). - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine +- label: V1 Test attention (H100) # 10min + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: V1 Test attention (B200) # 10min + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this + +- label: V1 Test others (CPU) # 5 mins + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + + - label: Examples Test # 30min timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] @@ -334,7 +393,8 @@ steps: - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 @@ -369,7 +429,12 @@ steps: --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss_tp.py \ + --ignore=lora/test_qwen3moe_tp.py + parallelism: 4 - label: PyTorch Compilation Unit Tests # 15min @@ -380,15 +445,18 @@ steps: - vllm/ - tests/compile commands: + - pytest -v -s compile/test_graph_partition.py + - pytest -v -s compile/test_config.py - pytest -v -s compile/test_pass_manager.py - pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_fusion_attn.py + - pytest -v -s compile/test_functionalization.py - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/test_sequence_parallelism.py - - pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_decorator.py - pytest -v -s compile/test_noop_elimination.py + - pytest -v -s compile/test_aot_compile.py + - pytest -v -s compile/test_qk_norm_rope_fusion.py - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -399,17 +467,34 @@ steps: - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/test_multimodal_compile.py - pytest -v -s compile/piecewise/ -- label: PyTorch Fullgraph Test # 20min - timeout_in_minutes: 30 +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph.py + # fp8 kv scales not supported on sm89, tested on Blackwell instead + - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + +- label: Cudagraph test + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental] + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py - label: Kernels Core Operation Test # 48min timeout_in_minutes: 75 @@ -417,8 +502,9 @@ steps: source_file_dependencies: - csrc/ - tests/kernels/core + - tests/kernels/test_top_k_per_row.py commands: - - pytest -v -s kernels/core + - pytest -v -s kernels/core kernels/test_top_k_per_row.py - label: Kernels Attention Test %N # 23min timeout_in_minutes: 35 @@ -452,6 +538,8 @@ steps: - tests/kernels/moe - vllm/model_executor/layers/fused_moe/ - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config commands: - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 2 @@ -462,32 +550,25 @@ steps: source_file_dependencies: - csrc/mamba/ - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops commands: - pytest -v -s kernels/mamba -- label: Tensorizer Test # 14min - timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/model_executor/model_loader - - tests/tensorizer_loader - - tests/entrypoints/openai/test_tensorizer_entrypoint.py - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s tensorizer_loader - - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - -- label: Model Executor Test # 7min - timeout_in_minutes: 20 +- label: Model Executor Test # 23min + timeout_in_minutes: 35 + torch_nightly: true mirror_hardwares: [amdexperimental] source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py - vllm/model_executor - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py commands: - apt-get update && apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py - label: Benchmarks # 11min timeout_in_minutes: 20 @@ -521,8 +602,9 @@ steps: # since torchao nightly is only compatible with torch nightly currently # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved - - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: LM Eval Small Models # 53min timeout_in_minutes: 75 @@ -530,6 +612,7 @@ steps: source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization + autorun_on_main: true commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 @@ -550,10 +633,17 @@ steps: source_file_dependencies: - vllm/ - tests/tool_use - - tests/mistral_tool_use commands: - - pytest -v -s tool_use - - pytest -v -s mistral_tool_use + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) # 5 mins + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use ##### models test ##### @@ -593,13 +683,19 @@ steps: - vllm/ - tests/models/test_transformers.py - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + torch_nightly: true + source_file_dependencies: + - vllm/ - tests/models/test_utils.py - tests/models/test_vision.py + no_gpu: true commands: - - pytest -v -s models/test_transformers.py \ - models/test_registry.py \ - models/test_utils.py \ - models/test_vision.py + - pytest -v -s models/test_utils.py models/test_vision.py - label: Language Models Tests (Standard) timeout_in_minutes: 25 @@ -658,8 +754,10 @@ steps: - vllm/ - tests/models/language/generation commands: - # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - label: Language Models Test (PPL) @@ -714,6 +812,16 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - label: Multi-Modal Models Test (Extended) 1 mirror_hardwares: [amdexperimental] optional: true @@ -768,16 +876,17 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_transformers.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper -- label: Blackwell Test # 38 min - timeout_in_minutes: 60 +- label: Blackwell Test # 21 min + timeout_in_minutes: 30 working_dir: "/vllm-workspace/" gpu: b200 # optional: true @@ -790,13 +899,16 @@ steps: - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/fusion.py - - vllm/compilation/fusion_attn.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py # Attention # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py @@ -808,19 +920,64 @@ steps: - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py - # Fusion - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py + +- label: Blackwell Fusion and Compile Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile -- label: GPT-OSS Eval (Blackwell) +- label: Blackwell Fusion E2E Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusions_e2e.py + - tests/compile/test_full_graph.py + commands: + - nvidia-smi + # Run all e2e fusion tests + - pytest -v -s tests/compile/test_fusions_e2e.py + +- label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" gpu: b200 - optional: true # disable while debugging + optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss - vllm/model_executor/models/gpt_oss.py @@ -828,7 +985,34 @@ steps: - vllm/v1/attention/backends/flashinfer.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Blackwell Quantized MoE Test + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 ##### 1 GPU test ##### ##### multi gpus test ##### @@ -889,19 +1073,21 @@ steps: - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py - - tests/v1/test_async_llm_dp.py - - tests/v1/test_external_lb_dp.py + - tests/v1/distributed - tests/v1/entrypoints/openai/test_multi_api_servers.py - tests/v1/shutdown - tests/v1/worker/test_worker_memory_snapshot.py commands: - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py @@ -945,6 +1131,11 @@ steps: - pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test # other tests continue here: - pytest -v -s plugins_tests/test_scheduler_plugins.py - pip install -e ./plugins/vllm_add_dummy_model @@ -984,6 +1175,8 @@ steps: - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py - label: Weight Loading Multiple GPU Test # 33min @@ -1010,6 +1203,17 @@ steps: commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt +- label: NixlConnector PD accuracy tests (Distributed) # 30min + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + ##### multi gpus test ##### ##### A100 test ##### @@ -1040,15 +1244,34 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + ##### H200 test ##### -- label: Distrubted Tests (H200) # optional +- label: Distributed Tests (H200) # optional gpu: h200 optional: true working_dir: "/vllm-workspace/" num_gpus: 2 commands: + - pytest -v -s tests/compile/test_async_tp.py + - pytest -v -s tests/compile/test_sequence_parallelism.py + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### - label: Distributed Tests (B200) # optional @@ -1059,6 +1282,7 @@ steps: commands: - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py ##### RL Integration Tests ##### - label: Prime-RL Integration Test # 15min @@ -1071,3 +1295,21 @@ steps: - .buildkite/scripts/run-prime-rl-test.sh commands: - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: DeepSeek V2-Lite Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 diff --git a/.coveragerc b/.coveragerc index bc6342956109bae444611e7f26700bf88b711c33..b7a9fdb4e05a872cd7f3e2515b09ddbd6635388e 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,5 +1,10 @@ [run] -source = vllm +# Track the installed vllm package (this is what actually gets imported during tests) +# Use wildcard pattern to match the installed location +source = + vllm + */dist-packages/vllm + */site-packages/vllm omit = */tests/* */test_* @@ -12,6 +17,16 @@ omit = */benchmarks/* */docs/* +[paths] +# Map all possible vllm locations to a canonical "vllm" path +# This ensures coverage.combine properly merges data from different test runs +source = + vllm + /vllm-workspace/src/vllm + /vllm-workspace/vllm + */site-packages/vllm + */dist-packages/vllm + [report] exclude_lines = pragma: no cover diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000000000000000000000000000000000..5a601d00cef8b9720cd6b078da6d1a14c5fff072 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,4 @@ +# Migrate from `yapf` & `isort` to `ruff` +d6953beb91da4e9c99be4c0a1304a2d24189535c +# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y` +8fcaaf6a165e661f63fc51be906bc05b0767332f diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 9d749fe8d3238b9c6530ebe5e85d3fa238fd368c..6e178bb690c56f442f875ae9b694e0441f84801f 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,17 +3,13 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention @LucasWilkinson -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn -/vllm/model_executor/layers/fused_moe @mgoin -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche -/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 +/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn +/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety +/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/model_loader @22quinn -/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche -/vllm/v1/attention @LucasWilkinson -/vllm/v1/sample @22quinn @houseroad +/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee /vllm/reasoning @aarnphm @chaunceyjiang @@ -24,44 +20,57 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people -/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg +/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg +/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 # vLLM V1 -/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat -/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett -/vllm/v1/spec_decode @benchislett @luccafong -/vllm/v1/attention/backends/flashinfer.py @mgoin +/vllm/v1/attention @LucasWilkinson +/vllm/v1/attention/backends/mla @pavanimajety +/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety /vllm/v1/attention/backends/triton_attn.py @tdoublep -/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC +/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC +/vllm/v1/sample @22quinn @houseroad @njhill +/vllm/v1/spec_decode @benchislett @luccafong +/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett /vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/offloading @ApostaC # Test ownership -/.buildkite/lm-eval-harness @mgoin @simon-mo +/.buildkite/lm-eval-harness @mgoin /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche /tests/evals @mgoin /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche -/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 +/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC +/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC /tests/weight_loading @mgoin @youkaichao @yewentao256 /tests/lora @jeejeelee /tests/models/language/generation/test_hybrid.py @tdoublep -/tests/v1/kv_connector/nixl_integration @NickLucche +/tests/v1/kv_connector/nixl_integration @NickLucche /tests/v1/kv_connector @ApostaC /tests/v1/offloading @ApostaC -# Transformers backend -/vllm/model_executor/models/transformers.py @hmellor +# Transformers modeling backend +/vllm/model_executor/models/transformers @hmellor /tests/models/test_transformers.py @hmellor +# Observability +/vllm/config/observability.py @markmc +/vllm/v1/metrics @markmc +/tests/v1/metrics @markmc +/vllm/tracing.py @markmc +/tests/v1/tracing/test_tracing.py @markmc +/vllm/config/kv_events.py @markmc +/vllm/distributed/kv_events.py @markmc +/tests/distributed/test_events.py @markmc + # Docs /docs/mkdocs @hmellor /docs/**/*.yml @hmellor @@ -106,11 +115,21 @@ mkdocs.yaml @hmellor /vllm/attention/ops/triton_unified_attention.py @tdoublep # ROCm related: specify owner with write access to notify AMD folks for careful code review -/docker/Dockerfile.rocm* @gshtras -/vllm/v1/attention/backends/rocm*.py @gshtras -/vllm/v1/attention/backends/mla/rocm*.py @gshtras -/vllm/attention/ops/rocm*.py @gshtras -/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras +/vllm/**/*rocm* @tjtanaa +/docker/Dockerfile.rocm* @gshtras @tjtanaa +/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa +/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa +/vllm/attention/ops/rocm*.py @gshtras @tjtanaa +/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa +/csrc/rocm @gshtras @tjtanaa +/requirements/*rocm* @tjtanaa +/tests/**/*rocm* @tjtanaa +/docs/**/*rocm* @tjtanaa +/vllm/**/*quark* @tjtanaa +/tests/**/*quark* @tjtanaa +/docs/**/*quark* @tjtanaa +/vllm/**/*aiter* @tjtanaa +/tests/**/*aiter* @tjtanaa # TPU /vllm/v1/worker/tpu* @NickLucche @@ -120,3 +139,16 @@ mkdocs.yaml @hmellor # KVConnector installation files /requirements/kv_connectors.txt @NickLucche + +# Pooling models +/examples/*/pooling/ @noooop +/tests/models/*/pooling* @noooop +/tests/entrypoints/pooling @noooop +/vllm/config/pooler.py @noooop +/vllm/pooling_params.py @noooop +/vllm/model_executor/layers/pooler.py @noooop + +# Security guide and policies +/docs/usage/security.md @russellb +/SECURITY.md @russellb +/docs/contributing/vulnerability_management.md @russellb diff --git a/.github/mergify.yml b/.github/mergify.yml index 75ee3e3c55b4634d8e38ff8e615614af3ddb911d..997a40e18e5885c282984d61997bf1dbef3397f6 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -2,6 +2,7 @@ pull_request_rules: - name: label-documentation description: Automatically apply documentation label conditions: + - label != stale - or: - files~=^[^/]+\.md$ - files~=^docs/ @@ -10,10 +11,13 @@ pull_request_rules: label: add: - documentation + comment: + message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/" - name: label-ci-build description: Automatically apply ci/build label conditions: + - label != stale - or: - files~=^\.github/ - files~=\.buildkite/ @@ -30,6 +34,7 @@ pull_request_rules: - name: label-deepseek description: Automatically apply deepseek label conditions: + - label != stale - or: - files~=^examples/.*deepseek.*\.py - files~=^tests/.*deepseek.*\.py @@ -46,6 +51,7 @@ pull_request_rules: - name: label-frontend description: Automatically apply frontend label conditions: + - label != stale - files~=^vllm/entrypoints/ actions: label: @@ -55,6 +61,7 @@ pull_request_rules: - name: label-llama description: Automatically apply llama label conditions: + - label != stale - or: - files~=^examples/.*llama.*\.py - files~=^tests/.*llama.*\.py @@ -70,6 +77,7 @@ pull_request_rules: - name: label-multi-modality description: Automatically apply multi-modality label conditions: + - label != stale - or: - files~=^vllm/multimodal/ - files~=^tests/multimodal/ @@ -83,6 +91,7 @@ pull_request_rules: - name: label-new-model description: Automatically apply new-model label conditions: + - label != stale - and: - files~=^vllm/model_executor/models/ - files=vllm/model_executor/models/registry.py @@ -94,11 +103,12 @@ pull_request_rules: - name: label-performance description: Automatically apply performance label conditions: + - label != stale - or: - files~=^benchmarks/ - files~=^vllm/benchmarks/ - files~=^tests/benchmarks/ - - files~=^\.buildkite/nightly-benchmarks/ + - files~=^\.buildkite/performance-benchmarks/ actions: label: add: @@ -107,6 +117,7 @@ pull_request_rules: - name: label-qwen description: Automatically apply qwen label conditions: + - label != stale - or: - files~=^examples/.*qwen.*\.py - files~=^tests/.*qwen.*\.py @@ -121,6 +132,7 @@ pull_request_rules: - name: label-gpt-oss description: Automatically apply gpt-oss label conditions: + - label != stale - or: - files~=^examples/.*gpt[-_]?oss.*\.py - files~=^tests/.*gpt[-_]?oss.*\.py @@ -139,9 +151,27 @@ pull_request_rules: add: - gpt-oss +- name: label-nvidia + description: Automatically apply nvidia label + conditions: + - label != stale + - or: + - files~=cuda + - files~=cutlass + - files~=flashinfer + - files~=trtllm + - title~=(?i)NVIDIA + - title~=(?i)CUDA + - title~=(?i)CUTLASS + actions: + label: + add: + - nvidia + - name: label-rocm description: Automatically apply rocm label conditions: + - label != stale - or: - files~=^csrc/rocm/ - files~=^docker/Dockerfile.rocm @@ -162,6 +192,7 @@ pull_request_rules: - name: label-structured-output description: Automatically apply structured-output label conditions: + - label != stale - or: - files~=^benchmarks/structured_schemas/ - files=benchmarks/benchmark_serving_structured_output.py @@ -181,6 +212,7 @@ pull_request_rules: - name: label-speculative-decoding description: Automatically apply speculative-decoding label conditions: + - label != stale - or: - files~=^vllm/v1/spec_decode/ - files~=^tests/v1/spec_decode/ @@ -196,6 +228,7 @@ pull_request_rules: - name: label-v1 description: Automatically apply v1 label conditions: + - label != stale - or: - files~=^vllm/v1/ - files~=^tests/v1/ @@ -208,6 +241,7 @@ pull_request_rules: description: Automatically apply tpu label # Keep this list in sync with `label-tpu-remove` conditions conditions: + - label != stale - or: - files~=tpu.py - files~=_tpu @@ -223,6 +257,7 @@ pull_request_rules: description: Automatically remove tpu label # Keep this list in sync with `label-tpu` conditions conditions: + - label != stale - and: - -files~=tpu.py - -files~=_tpu @@ -237,9 +272,9 @@ pull_request_rules: - name: label-tool-calling description: Automatically add tool-calling label conditions: + - label != stale - or: - files~=^tests/tool_use/ - - files~=^tests/mistral_tool_use/ - files~=^tests/entrypoints/openai/tool_parsers/ - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files~=^vllm/entrypoints/openai/tool_parsers/ @@ -256,8 +291,9 @@ pull_request_rules: - name: ping author on conflicts and add 'needs-rebase' label conditions: - - conflict - - -closed + - label != stale + - conflict + - -closed actions: label: add: @@ -271,10 +307,12 @@ pull_request_rules: - name: assign reviewer for tensorizer changes conditions: + - label != stale + - or: - files~=^vllm/model_executor/model_loader/tensorizer.py - files~=^vllm/model_executor/model_loader/tensorizer_loader.py - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py - - files~=^tests/tensorizer_loader/ + - files~=^tests/model_executor/model_loader/tensorizer_loader/ actions: assign: users: @@ -282,6 +320,7 @@ pull_request_rules: - name: assign reviewer for modelopt changes conditions: + - label != stale - or: - files~=^vllm/model_executor/layers/quantization/modelopt\.py$ - files~=^vllm/model_executor/layers/quantization/__init__\.py$ @@ -296,8 +335,8 @@ pull_request_rules: - name: remove 'needs-rebase' label when conflict is resolved conditions: - - -conflict - - -closed + - -conflict + - -closed actions: label: remove: @@ -306,6 +345,7 @@ pull_request_rules: - name: label-kv-connector description: Automatically apply kv-connector label conditions: + - label != stale - or: - files~=^examples/online_serving/disaggregated[^/]*/.* - files~=^examples/offline_inference/disaggregated[^/]*/.* diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index c2b17abe811cdcd133159dd4349bb8ed9235aede..7d565ef9f2e459531a1a231db9d244dd4f4085be 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -13,6 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Label issues based on keywords + id: label-step uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 with: script: | @@ -42,7 +43,6 @@ jobs: searchIn: "body" }, ], - // Substring search - matches anywhere in text (partial matches) substrings: [ { @@ -89,14 +89,12 @@ jobs: term: "hip_", searchIn: "both" }, - // ROCm tools and libraries { term: "hipify", searchIn: "both" }, ], - // Regex patterns - for complex pattern matching regexPatterns: [ { @@ -107,13 +105,17 @@ jobs: } ], }, + // Add more label configurations here as needed + // example: { + // keywords: [...], + // substrings: [...], + // regexPatterns: [...] + // }, }; - // Helper function to create regex based on search type function createSearchRegex(term, type) { // Escape special regex characters in the term const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); - switch (type) { case 'keyword': // Word boundary search - matches whole words only @@ -125,16 +127,13 @@ jobs: throw new Error(`Unknown search type: ${type}`); } } - // Helper function to find matching terms in text with line information function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { const matches = []; const lines = text.split('\n'); - for (const termConfig of searchTerms) { let regex; let term, searchIn, pattern, description, flags; - // Handle different input formats (string or object) if (typeof termConfig === 'string') { term = termConfig; @@ -146,21 +145,17 @@ jobs: description = termConfig.description; flags = termConfig.flags; } - // Skip if this term shouldn't be searched in the current location if (searchIn !== 'both' && searchIn !== searchLocation) { continue; } - // Create appropriate regex if (searchType === 'regex') { regex = new RegExp(pattern, flags || "gi"); } else { regex = createSearchRegex(term, searchType); } - const termMatches = []; - // Check each line for matches lines.forEach((line, lineIndex) => { const lineMatches = line.match(regex); @@ -175,15 +170,14 @@ jobs: originalTerm: term || pattern, description: description, // Show context around the match in the line - context: line.length > 100 ? - line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), - line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' + context: line.length > 100 ? + line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), + line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' : line.trim() }); }); } }); - if (termMatches.length > 0) { matches.push({ term: term || (description || pattern), @@ -196,64 +190,48 @@ jobs: }); } } - return matches; } - // Helper function to check if label should be added async function processLabel(labelName, config) { const body = context.payload.issue.body || ""; const title = context.payload.issue.title || ""; - core.notice(`Processing label: ${labelName}`); core.notice(`Issue Title: "${title}"`); core.notice(`Issue Body length: ${body.length} characters`); - let shouldAddLabel = false; let allMatches = []; let reason = ''; - const keywords = config.keywords || []; const substrings = config.substrings || []; const regexPatterns = config.regexPatterns || []; - core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); - // Search in title if (title.trim()) { core.notice(`Searching in title: "${title}"`); - const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); - allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); } - // Search in body if (body.trim()) { core.notice(`Searching in body (${body.length} characters)`); - const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); - allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); } - if (allMatches.length > 0) { core.notice(`Found ${allMatches.length} matching term(s):`); - for (const termMatch of allMatches) { const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; - if (termMatch.searchType === 'regex') { core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); } else { core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); } - // Show details for each match termMatch.matches.forEach((match, index) => { core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); @@ -266,7 +244,6 @@ jobs: } }); } - shouldAddLabel = true; const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); @@ -274,13 +251,10 @@ jobs: const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); - reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; } - core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); core.notice(`Reason: ${reason || 'No matching terms found'}`); - if (shouldAddLabel) { const existingLabels = context.payload.issue.labels.map(l => l.name); if (!existingLabels.includes(labelName)) { @@ -296,14 +270,92 @@ jobs: core.notice(`Label "${labelName}" already present.`); return false; } - core.notice(`No matching terms found for label "${labelName}".`); return false; } - // Process all configured labels - const processLabels = Object.entries(labelConfig) - .map(([labelName, config]) => processLabel(labelName, config)); - const labelsAdded = await Promise.all(processLabels); - const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0); - core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); \ No newline at end of file + const labelsAddedResults = await Promise.all( + Object.entries(labelConfig).map(([labelName, config]) => + processLabel(labelName, config).then(added => ({ labelName, added })) + ) + ); + + const numLabelsAdded = labelsAddedResults.filter(r => r.added).length; + core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); + + // Return which labels were added for the next step + const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName); + core.setOutput('labels_added', JSON.stringify(addedLabels)); + return addedLabels; + + - name: CC users for labeled issues + if: steps.label-step.outputs.labels_added != '[]' + uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0 + with: + script: | + // Configuration: Map labels to GitHub users to CC + // You can add multiple users per label, and multiple label configurations + const ccConfig = { + rocm: { + users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3'] + message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions + }, + // Add more label -> user mappings here + // Example: + // cuda: { + // users: ['user1', 'user2'], + // message: 'CC {users} for CUDA-related issue' + // }, + // performance: { + // users: ['perfexpert'], + // message: 'CC {users} for performance issue' + // }, + }; + + const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}'); + core.notice(`Labels added: ${labelsAdded.join(', ')}`); + + // Get existing comments to check for already mentioned users + const comments = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + const issueBody = context.payload.issue.body || ''; + const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n'); + + // Process each label that was added + for (const label of labelsAdded) { + if (ccConfig[label]) { + const config = ccConfig[label]; + const usersToMention = []; + + // Check which users haven't been mentioned yet + for (const user of config.users) { + const mentionPattern = new RegExp(`@${user}\\b`, 'i'); + if (!mentionPattern.test(allExistingText)) { + usersToMention.push(user); + } else { + core.notice(`@${user} already mentioned for label "${label}", skipping`); + } + } + + // Post comment if there are users to mention + if (usersToMention.length > 0) { + const mentions = usersToMention.map(u => `@${u}`).join(' '); + const message = config.message.replace('{users}', mentions); + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: message + }); + + core.notice(`CC comment added for label "${label}": ${mentions}`); + } else { + core.notice(`All users for label "${label}" already mentioned, skipping comment`); + } + } + } \ No newline at end of file diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml new file mode 100644 index 0000000000000000000000000000000000000000..42b05ecd5ac06ca8e55d36a87c2f628a1d6c3f73 --- /dev/null +++ b/.github/workflows/macos-smoke-test.yml @@ -0,0 +1,81 @@ +name: macOS Apple Silicon Smoke Test + +on: + push: + branches: + - main + workflow_dispatch: # Manual trigger + +jobs: + macos-m1-smoke-test: + runs-on: macos-latest + timeout-minutes: 20 + + steps: + - uses: actions/checkout@v4 + + - uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + cache-dependency-glob: | + requirements/**/*.txt + pyproject.toml + python-version: '3.12' + + - name: Create virtual environment + run: | + uv venv + echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH" + + - name: Install dependencies and build vLLM + run: | + uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match + uv pip install -e . + env: + CMAKE_BUILD_PARALLEL_LEVEL: 4 + + - name: Verify installation + run: | + python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" + python -c "import torch; print(f'PyTorch: {torch.__version__}')" + + - name: Smoke test vllm serve + timeout-minutes: 10 + run: | + # Start server in background + vllm serve Qwen/Qwen3-0.6B \ + --max-model-len=2048 \ + --load-format=dummy \ + --enforce-eager \ + --port 8000 & + + SERVER_PID=$! + + # Wait for server to start + for i in {1..30}; do + if curl -s http://localhost:8000/health > /dev/null; then + echo "Server started successfully" + break + fi + if [ "$i" -eq 30 ]; then + echo "Server failed to start" + kill "$SERVER_PID" + exit 1 + fi + sleep 2 + done + + # Test health endpoint + curl -f http://localhost:8000/health + + # Test completion + curl -f http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-0.6B", + "prompt": "Hello", + "max_tokens": 5 + }' + + # Cleanup + kill "$SERVER_PID" diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 82844810a633a18a73ddd2a73e12d0cd9073934e..dca3089f496c9639674ab3b87d24c93adfd05c13 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0 + - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months diff --git a/.gitignore b/.gitignore index b1df673e83ca8c1f94fbeb1e1754cd541c39ea26..50070d7898fe6d40cb0e162892b70111291f5fa0 100644 --- a/.gitignore +++ b/.gitignore @@ -94,6 +94,9 @@ ipython_config.py # generated files **/generated/** +# uv +uv.lock + # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: @@ -218,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_* # Ignore ep_kernels_workspace folder ep_kernels_workspace/ + +# Allow tracked library source folders under submodules (e.g., benchmarks/lib) +!vllm/benchmarks/lib/ diff --git a/.markdownlint.yaml b/.markdownlint.yaml index c86fed9555d6235fa1e33652256f1609aec18d6a..937487f47364d04a8f8cb02a6bd0b8ac467362ec 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -3,11 +3,9 @@ MD007: MD013: false MD024: siblings_only: true +MD031: + list_items: false MD033: false -MD042: false -MD045: false MD046: false -MD051: false MD052: false -MD053: false MD059: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8ca414ee4269b1e9fcbf38e8505001ea2edfa71e..e034f75a9d3224b7f50030128912d4b18e7cf50e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,30 +6,19 @@ default_stages: - manual # Run in CI exclude: 'vllm/third_party/.*' repos: -- repo: https://github.com/google/yapf - rev: v0.43.0 - hooks: - - id: yapf - args: [--in-place, --verbose] - # Keep the same list from yapfignore here to avoid yapf failing without any inputs - exclude: '(.buildkite|benchmarks|build|examples)/.*' - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.7 + rev: v0.14.0 hooks: - - id: ruff + - id: ruff-check args: [--output-format, github, --fix] - id: ruff-format - files: ^(.buildkite|benchmarks|examples)/.* - repo: https://github.com/crate-ci/typos - rev: v1.35.5 + rev: v1.38.1 hooks: - id: typos -- repo: https://github.com/PyCQA/isort - rev: 6.0.1 - hooks: - - id: isort + args: [--force-exclude] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v20.1.3 + rev: v21.1.2 hooks: - id: clang-format exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' @@ -46,32 +35,27 @@ repos: hooks: - id: actionlint - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.6.17 + rev: 0.9.1 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"] files: ^requirements/test\.(in|txt)$ - repo: local hooks: - id: format-torch-nightly-test name: reformat nightly_torch_test.txt to be in sync with test.in language: python - entry: python tools/generate_nightly_torch_test.py + entry: python tools/pre_commit/generate_nightly_torch_test.py files: ^requirements/test\.(in|txt)$ - id: mypy-local - name: Run mypy for local Python installation - entry: python tools/pre_commit/mypy.py 0 "local" + name: Run mypy locally for lowest supported Python version + entry: python tools/pre_commit/mypy.py 0 "3.10" stages: [pre-commit] # Don't run in CI <<: &mypy_common language: python types_or: [python, pyi] require_serial: true additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] - - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.9 - entry: python tools/pre_commit/mypy.py 1 "3.9" - <<: *mypy_common - stages: [manual] # Only run in CI - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 entry: python tools/pre_commit/mypy.py 1 "3.10" @@ -87,14 +71,19 @@ repos: entry: python tools/pre_commit/mypy.py 1 "3.12" <<: *mypy_common stages: [manual] # Only run in CI + - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.13 + entry: python tools/pre_commit/mypy.py 1 "3.13" + <<: *mypy_common + stages: [manual] # Only run in CI - id: shellcheck name: Lint shell scripts - entry: tools/shellcheck.sh + entry: tools/pre_commit/shellcheck.sh language: script types: [shell] - id: png-lint name: Lint PNG exports from excalidraw - entry: tools/png-lint.sh + entry: tools/pre_commit/png-lint.sh language: script types: [png] - id: signoff-commit @@ -111,12 +100,12 @@ repos: stages: [commit-msg] - id: check-spdx-header name: Check SPDX headers - entry: python tools/check_spdx_header.py + entry: python tools/pre_commit/check_spdx_header.py language: python types: [python] - id: check-root-lazy-imports name: Check root lazy imports - entry: python tools/check_init_lazy_imports.py + entry: python tools/pre_commit/check_init_lazy_imports.py language: python types: [python] - id: check-filenames @@ -130,11 +119,11 @@ repos: pass_filenames: false - id: update-dockerfile-graph name: Update Dockerfile dependency graph - entry: tools/update-dockerfile-graph.sh + entry: tools/pre_commit/update-dockerfile-graph.sh language: script - id: enforce-import-regex-instead-of-re name: Enforce import regex as re - entry: python tools/enforce_regex_import.py + entry: python tools/pre_commit/enforce_regex_import.py language: python types: [python] pass_filenames: false @@ -142,7 +131,7 @@ repos: # forbid directly import triton - id: forbid-direct-triton-import name: "Forbid direct 'import triton'" - entry: python tools/check_triton_import.py + entry: python tools/pre_commit/check_triton_import.py language: python types: [python] pass_filenames: false @@ -155,7 +144,7 @@ repos: additional_dependencies: [regex] - id: validate-config name: Validate configuration has default values and that each field has a docstring - entry: python tools/validate_config.py + entry: python tools/pre_commit/validate_config.py language: python additional_dependencies: [regex] # Keep `suggestion` last diff --git a/CMakeLists.txt b/CMakeLists.txt index 138537ba37fb02156571936d170f8582992a4d54..4bdaf4181e7c1a4d37c13cd03ad4616966b18b9e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,11 +34,18 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13") +set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx928;gfx936") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151;gfx928;gfx936") +# ROCm installation prefix. Default to /opt/rocm but allow override via +# -DROCM_PATH=/your/rocm/path when invoking cmake. +if(NOT DEFINED ROCM_PATH) + set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix") +else() + set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE) +endif() # # Supported/expected torch versions for CUDA/ROCm. # @@ -49,8 +56,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0") # # Try to find python package with an executable that exactly matches @@ -86,6 +93,9 @@ find_package(Torch REQUIRED) # Supported NVIDIA architectures. # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0") +elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") else() @@ -175,6 +185,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") endif() +# +# Set compression mode for CUDA >=13.x. +# +if(VLLM_GPU_LANG STREQUAL "CUDA" AND + DEFINED CMAKE_CUDA_COMPILER_VERSION AND + CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + list(APPEND VLLM_GPU_FLAGS "--compress-mode=size") +endif() + # # Set CUDA include flags for CXX compiler. # @@ -225,11 +244,28 @@ set_gencode_flags_for_srcs( SRCS "${VLLM_CUMEM_EXT_SRC}" CUDA_ARCHS "${CUDA_ARCHS}") -if(VLLM_GPU_LANG STREQUAL "CUDA") +if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") message(STATUS "Enabling cumem allocator extension.") - # link against cuda driver library - list(APPEND CUMEM_LIBS CUDA::cuda_driver) - define_gpu_extension_target( + if(VLLM_GPU_LANG STREQUAL "CUDA") + # link against cuda driver library + list(APPEND CUMEM_LIBS CUDA::cuda_driver) + else() + # link against rocm driver library. Prefer an absolute path to + # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall + # back to linking by name "amdhip64". + find_library(AMDHIP64_LIB + NAMES amdhip64 libamdhip64.so + PATHS ${ROCM_PATH}/lib + NO_DEFAULT_PATH) + if(AMDHIP64_LIB) + message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}") + list(APPEND CUMEM_LIBS ${AMDHIP64_LIB}) + else() + message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name") + list(APPEND CUMEM_LIBS amdhip64) + endif() + endif() + define_extension_target( cumem_allocator DESTINATION vllm LANGUAGE CXX @@ -253,13 +289,13 @@ set(VLLM_EXT_SRC "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" - "csrc/opt/transpose_kernels.cu" + "csrc/fused_qknorm_rope_kernel.cu" # "csrc/layernorm_quant_kernels.cu" "csrc/sampler.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" - "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" - # "csrc/quantization/fp8/common.cu" + "csrc/quantization/w8a8/int8/scaled_quant.cu" + # "csrc/quantization/w8a8/fp8/common.cu" "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" "csrc/quantization/gguf/gguf_kernel.cu" # "csrc/quantization/activation_kernels.cu" @@ -271,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -303,13 +339,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" "csrc/permute_cols.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" - "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/cutlass_extensions/common.cpp" - "csrc/quantization/fp8/per_token_group_quant.cu") + "csrc/quantization/w8a8/fp8/per_token_group_quant.cu" + "csrc/quantization/w8a8/int8/per_token_group_quant.cu") set_gencode_flags_for_srcs( SRCS "${VLLM_EXT_SRC}" @@ -319,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_ARCHS) # @@ -413,11 +449,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -441,12 +477,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require # CUDA 12.8 or later - cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu" + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu" ) set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -471,12 +511,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) # require CUDA 12.8 or later - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" - "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu" + "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu" + "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu" ) set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -507,7 +551,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") @@ -551,7 +595,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require # CUDA 12.8 or later - cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" @@ -570,7 +618,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # FP4 Archs and flags - cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" @@ -592,7 +644,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # CUTLASS MLA Archs and flags - cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) set(SRCS "csrc/attention/mla/sm100_cutlass_mla_kernel.cu") @@ -618,7 +674,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # if it's possible to compile MoE kernels that use its output. cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -636,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -657,9 +717,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # moe_data.cu is used by all CUTLASS MoE kernels. - cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}") @@ -676,9 +740,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() - cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") + else() + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") + endif() if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu") + set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_ARCHS}") @@ -793,7 +861,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() # Hadacore kernels - cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") if(HADACORE_ARCHS) set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu") set_gencode_flags_for_srcs( @@ -815,7 +883,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP") endif() message(STATUS "Enabling C extension.") -define_gpu_extension_target( +define_extension_target( _C DESTINATION vllm LANGUAGE ${VLLM_GPU_LANG} @@ -840,6 +908,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" "csrc/moe/moe_align_sum_kernels.cu" + "csrc/moe/moe_lora_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") @@ -870,7 +939,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) # @@ -929,7 +998,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() message(STATUS "Enabling moe extension.") -define_gpu_extension_target( +define_extension_target( _moe_C DESTINATION vllm LANGUAGE ${VLLM_GPU_LANG} @@ -951,7 +1020,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP") "csrc/rocm/skinny_gemms.cu" "csrc/rocm/attention.cu") - define_gpu_extension_target( + define_extension_target( _rocm_C DESTINATION vllm LANGUAGE ${VLLM_GPU_LANG} @@ -966,6 +1035,7 @@ endif() # For CUDA we also build and ship some external projects. if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) + include(cmake/external_projects/qutlass.cmake) # vllm-flash-attn should be last as it overwrites some CMake functions include(cmake/external_projects/vllm_flash_attn.cmake) diff --git a/README_ORIGIN.md b/README_ORIGIN.md index 0c6e5aa6b31d215f413a241b37d8f879c17fe979..fc339ceda8839fa6344795f7b3886fcfe32d2d95 100644 --- a/README_ORIGIN.md +++ b/README_ORIGIN.md @@ -21,6 +21,10 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio *Latest News* 🔥 +- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI) +- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link). +- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6). +- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing). - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA). - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). @@ -81,7 +85,7 @@ vLLM is flexible and easy to use with: - Tensor, pipeline, data and expert parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. +- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. - Prefix caching support - Multi-LoRA support @@ -148,6 +152,7 @@ Compute Resources: - Trainy - UC Berkeley - UC San Diego +- Volcengine Slack Sponsor: Anyscale @@ -178,4 +183,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Media Kit -- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) \ No newline at end of file diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index b333ba9cd8e99b85cdcaf5ff7b4bb2d3c0f1e6fd..56b721cbb402192153c3c0c8bafee725a86cdc83 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -74,7 +74,7 @@ start_server() { local vllm_log=$4 local profile_dir=$5 - pkill -if vllm + pkill -if "vllm serve" || true # Define the common arguments as a bash array. # Each argument and its value are separate elements. @@ -96,11 +96,11 @@ start_server() { # This correctly passes each element as a separate argument. if [[ -n "$profile_dir" ]]; then # Start server with profiling enabled - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ + VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & else # Start server without profiling - VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \ + VLLM_SERVER_DEV_MODE=1 \ vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & fi local server_pid=$! @@ -139,7 +139,7 @@ run_benchmark() { echo "vllm_log: $vllm_log" echo rm -f $vllm_log - pkill -if vllm + pkill -if "vllm serve" || true echo "starting server..." # Call start_server without a profile_dir to avoid profiling overhead @@ -232,7 +232,7 @@ run_benchmark() { echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - pkill -if vllm + pkill -if "vllm serve" || true sleep 10 echo "====================" return 0 @@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then else echo "No configuration met the latency requirements. Skipping final profiling run." fi -pkill -if vllm +pkill -if "vllm serve" || true echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index ba7c733be0b25bdef90e4fba703ecb4640fe1787..4021fede72153ee67b5570529c07e1fbdc750d41 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -8,7 +8,6 @@ import sys import time import traceback from dataclasses import dataclass, field -from typing import Optional, Union import aiohttp import huggingface_hub.constants @@ -28,13 +27,13 @@ class RequestFuncInput: prompt_len: int output_len: int model: str - model_name: Optional[str] = None - logprobs: Optional[int] = None - extra_body: Optional[dict] = None - multi_modal_content: Optional[dict | list[dict]] = None + model_name: str | None = None + logprobs: int | None = None + extra_body: dict | None = None + multi_modal_content: dict | list[dict] | None = None ignore_eos: bool = False - language: Optional[str] = None - request_id: Optional[str] = None + language: str | None = None + request_id: str | None = None @dataclass @@ -52,7 +51,7 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") @@ -133,7 +132,7 @@ async def async_request_tgi( async def async_request_trt_llm( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") @@ -204,7 +203,7 @@ async def async_request_trt_llm( async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("completions", "profile")), ( @@ -267,7 +266,7 @@ async def async_request_deepspeed_mii( async def async_request_openai_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("completions", "profile")), ( @@ -367,7 +366,7 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("chat/completions", "profile")), ( @@ -476,7 +475,7 @@ async def async_request_openai_chat_completions( async def async_request_openai_audio( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: # Lazy import without PlaceholderModule to avoid vllm dep. import soundfile @@ -610,7 +609,7 @@ def get_tokenizer( tokenizer_mode: str = "auto", trust_remote_code: bool = False, **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +) -> PreTrainedTokenizer | PreTrainedTokenizerFast: if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path ): diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py new file mode 100755 index 0000000000000000000000000000000000000000..b5c16c42de467abb26927a017003fc36c5c33e71 --- /dev/null +++ b/benchmarks/benchmark_batch_invariance.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode. + +This benchmark runs the same workload twice: +1. With VLLM_BATCH_INVARIANT=0 (baseline) +2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode) + +And reports the timing and throughput metrics for comparison. + +Environment variables: + VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B") + VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek) + VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128) + VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5) + VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024) + VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048) + VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128) + VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0) + VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4) + VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120) + VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN) + +Example usage: + # Benchmark qwen3 (default) + python benchmarks/benchmark_batch_invariance.py + + # Benchmark deepseek with 8 GPUs + VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\ + python benchmarks/benchmark_batch_invariance.py + + # Quick test with fewer trials + VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\ + python benchmarks/benchmark_batch_invariance.py +""" + +import contextlib +import os +import random +import time + +from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + + +def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: + """Generate a random prompt for benchmarking.""" + prompt_templates = [ + "Question: What is the capital of France?\nAnswer: The capital of France is", + "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which", + "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is", + "Once upon a time in a distant galaxy, there lived", + "The old man walked slowly down the street, remembering", + "In the year 2157, humanity finally discovered", + "To implement a binary search tree in Python, first we need to", + "The algorithm works by iterating through the array and", + "Here's how to optimize database queries using indexing:", + "The Renaissance was a period in European history that", + "Climate change is caused by several factors including", + "The human brain contains approximately 86 billion neurons which", + "I've been thinking about getting a new laptop because", + "Yesterday I went to the store and bought", + "My favorite thing about summer is definitely", + ] + + base_prompt = random.choice(prompt_templates) + + if max_words < min_words: + max_words = min_words + target_words = random.randint(min_words, max_words) + + if target_words > 50: + padding_text = ( + " This is an interesting topic that deserves more explanation. " + * (target_words // 50) + ) + base_prompt = base_prompt + padding_text + + return base_prompt + + +def run_benchmark_with_batch_invariant( + model: str, + tp_size: int, + max_batch_size: int, + num_trials: int, + min_prompt: int, + max_prompt: int, + max_tokens: int, + temperature: float, + gpu_mem_util: float, + max_model_len: int, + backend: str, + batch_invariant: bool, + seed: int = 12345, +) -> dict: + """ + Run the benchmark with the specified configuration. + + Returns a dict with timing and throughput metrics. + """ + random.seed(seed) + + # Set environment variables + os.environ["VLLM_ATTENTION_BACKEND"] = backend + if batch_invariant: + os.environ["VLLM_BATCH_INVARIANT"] = "1" + else: + os.environ["VLLM_BATCH_INVARIANT"] = "0" + + print(f"\n{'=' * 80}") + print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}") + print(f" Model: {model}") + print(f" TP Size: {tp_size}") + print(f" Backend: {backend}") + print(f" Max Batch Size: {max_batch_size}") + print(f" Trials: {num_trials}") + print(f" Max Tokens: {max_tokens}") + print(f"{'=' * 80}\n") + + sampling = SamplingParams( + temperature=temperature, + top_p=0.95, + max_tokens=max_tokens, + seed=20240919, + ) + + needle_prompt = "There once was a " + + llm = None + try: + # Create LLM engine + start_init = time.perf_counter() + llm = LLM( + model=model, + max_num_seqs=max_batch_size, + gpu_memory_utilization=gpu_mem_util, + max_model_len=max_model_len, + dtype="bfloat16", + tensor_parallel_size=tp_size, + enable_prefix_caching=False, + ) + init_time = time.perf_counter() - start_init + print(f"Engine initialization time: {init_time:.2f}s\n") + + # Generate baseline + print("Generating baseline (warmup)...") + baseline_out = llm.generate([needle_prompt], sampling) + assert len(baseline_out) == 1 + baseline_text = baseline_out[0].outputs[0].text + print(f"Baseline output: '{baseline_text[:50]}...'\n") + + # Run trials and measure timing + trial_times: list[float] = [] + total_tokens = 0 + total_prompts = 0 + + for trial in range(num_trials): + # Create a batch + prompts: list[str] = [] + batch_size = random.randint(max_batch_size // 2, max_batch_size) + needle_pos = random.randint(0, batch_size - 1) + for i in range(batch_size): + if i == needle_pos: + prompts.append(needle_prompt) + else: + prompts.append(_random_prompt(min_prompt, max_prompt)) + + # Measure time for this trial + start_time = time.perf_counter() + outputs = llm.generate(prompts, sampling) + trial_time = time.perf_counter() - start_time + + trial_times.append(trial_time) + total_prompts += len(prompts) + + # Count tokens + for output in outputs: + if output.outputs: + total_tokens += len(output.outputs[0].token_ids) + + print( + f"Trial {trial + 1}/{num_trials}: " + f"batch_size={batch_size}, " + f"time={trial_time:.2f}s" + ) + + # Verify needle output still matches + needle_output = outputs[needle_pos] + assert needle_output.prompt == needle_prompt + + # Compute statistics + avg_time = sum(trial_times) / len(trial_times) + min_time = min(trial_times) + max_time = max(trial_times) + throughput = total_tokens / sum(trial_times) + prompts_per_sec = total_prompts / sum(trial_times) + + print(f"\n{'=' * 80}") + print("RESULTS:") + print(f" Average time per trial: {avg_time:.2f}s") + print(f" Min time: {min_time:.2f}s") + print(f" Max time: {max_time:.2f}s") + print(f" Total tokens generated: {total_tokens}") + print(f" Total prompts processed: {total_prompts}") + print(f" Throughput: {throughput:.2f} tokens/s") + print(f" Prompts/s: {prompts_per_sec:.2f}") + print(f"{'=' * 80}\n") + + return { + "init_time": init_time, + "avg_time": avg_time, + "min_time": min_time, + "max_time": max_time, + "total_tokens": total_tokens, + "total_prompts": total_prompts, + "throughput": throughput, + "prompts_per_sec": prompts_per_sec, + "trial_times": trial_times, + } + + finally: + # Cleanup + if llm is not None: + with contextlib.suppress(Exception): + llm.shutdown() + + +def main(): + # Check platform support + if not (current_platform.is_cuda() and current_platform.has_device_capability(90)): + print("ERROR: Requires CUDA and >= Hopper (SM90)") + print(f"Current platform: {current_platform.device_type}") + if current_platform.is_cuda(): + print(f"Device capability: {current_platform.get_device_capability()}") + return 1 + + # Read configuration from environment + model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B") + tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1")) + max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128")) + num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5")) + min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024")) + max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048")) + max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128")) + temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0")) + gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4")) + max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120")) + backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN") + + print("\n" + "=" * 80) + print("VLLM BATCH INVARIANCE BENCHMARK") + print("=" * 80) + print("\nConfiguration:") + print(f" Model: {model}") + print(f" Tensor Parallel Size: {tp_size}") + print(f" Attention Backend: {backend}") + print(f" Max Batch Size: {max_batch_size}") + print(f" Number of Trials: {num_trials}") + print(f" Prompt Length Range: {min_prompt}-{max_prompt} words") + print(f" Max Tokens to Generate: {max_tokens}") + print(f" Temperature: {temperature}") + print(f" GPU Memory Utilization: {gpu_mem_util}") + print(f" Max Model Length: {max_model_len}") + print("=" * 80) + + # Run benchmark WITHOUT batch invariance (baseline) + print("\n" + "=" * 80) + print("PHASE 1: Running WITHOUT batch invariance (baseline)") + print("=" * 80) + baseline_results = run_benchmark_with_batch_invariant( + model=model, + tp_size=tp_size, + max_batch_size=max_batch_size, + num_trials=num_trials, + min_prompt=min_prompt, + max_prompt=max_prompt, + max_tokens=max_tokens, + temperature=temperature, + gpu_mem_util=gpu_mem_util, + max_model_len=max_model_len, + backend=backend, + batch_invariant=False, + ) + + # Run benchmark WITH batch invariance + print("\n" + "=" * 80) + print("PHASE 2: Running WITH batch invariance") + print("=" * 80) + batch_inv_results = run_benchmark_with_batch_invariant( + model=model, + tp_size=tp_size, + max_batch_size=max_batch_size, + num_trials=num_trials, + min_prompt=min_prompt, + max_prompt=max_prompt, + max_tokens=max_tokens, + temperature=temperature, + gpu_mem_util=gpu_mem_util, + max_model_len=max_model_len, + backend=backend, + batch_invariant=True, + ) + + # Compare results + print("\n" + "=" * 80) + print("COMPARISON: Batch Invariance vs Baseline") + print("=" * 80) + + init_overhead_pct = ( + (batch_inv_results["init_time"] - baseline_results["init_time"]) + / baseline_results["init_time"] + * 100 + ) + time_overhead_pct = ( + (batch_inv_results["avg_time"] - baseline_results["avg_time"]) + / baseline_results["avg_time"] + * 100 + ) + throughput_change_pct = ( + (batch_inv_results["throughput"] - baseline_results["throughput"]) + / baseline_results["throughput"] + * 100 + ) + + print("\nInitialization Time:") + print(f" Baseline: {baseline_results['init_time']:.2f}s") + print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s") + print(f" Overhead: {init_overhead_pct:+.2f}%") + + print("\nAverage Trial Time:") + print(f" Baseline: {baseline_results['avg_time']:.2f}s") + print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s") + print(f" Overhead: {time_overhead_pct:+.2f}%") + + print("\nThroughput (tokens/s):") + print(f" Baseline: {baseline_results['throughput']:.2f}") + print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}") + print(f" Change: {throughput_change_pct:+.2f}%") + + print("\nPrompts/s:") + print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}") + print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}") + + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + if time_overhead_pct > 0: + print( + f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% " + "overhead" + ) + else: + print( + f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% " + "faster (unexpected!)" + ) + + if abs(throughput_change_pct) < 1.0: + print("Throughput difference is negligible (< 1%)") + elif throughput_change_pct < 0: + print( + f"Throughput decreased by {-throughput_change_pct:.1f}% " + "with batch invariance" + ) + else: + print( + f"Throughput increased by {throughput_change_pct:.1f}% " + "with batch invariance (unexpected!)" + ) + + print("=" * 80 + "\n") + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py index eae8d9927ea391d496e2c5582ea72e67391c93d3..20cd26bdddf513c21f31853380d62583ac51980d 100644 --- a/benchmarks/benchmark_block_pool.py +++ b/benchmarks/benchmark_block_pool.py @@ -2,10 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc +from benchmark_utils import TimeCollector from tabulate import tabulate -from benchmark_utils import TimeCollector -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.core.block_pool import BlockPool diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 6e0f3b51c9d284d22dd1acdc1b794b9a5c0f31b1..f64fd09bab9fa7d57dfe5a1312bdcc6eb0f9292f 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -46,7 +46,7 @@ import time from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser def test_long_document_qa(llm=None, sampling_params=None, prompts=None): diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index d4b83edbd940edfd561df26fe04adcc32d0dca0d..dedb564fffac8eee211577c10f138247629177cb 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -5,9 +5,9 @@ import time from unittest import mock import numpy as np +from benchmark_utils import TimeCollector from tabulate import tabulate -from benchmark_utils import TimeCollector from vllm.config import ( CacheConfig, DeviceConfig, @@ -19,7 +19,7 @@ from vllm.config import ( VllmConfig, ) from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -164,7 +164,7 @@ def invoke_main() -> None: ) parser.add_argument( "--batched", action="store_true", help="consider time to prepare batch" - ) # noqa: E501 + ) parser.add_argument( "--num-iteration", type=int, diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index b5e2613de1cd4a0a2557cf09baa458aa15a6049c..28fc383a318dd16c54dcbc6cca0acdb5ac0b7876 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -32,13 +32,12 @@ import dataclasses import json import random import time -from typing import Optional from transformers import PreTrainedTokenizerBase from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser try: from vllm.transformers_utils.tokenizer import get_tokenizer @@ -70,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]: # Remove the special tokens. return random.choices( - [v for k, v in vocab.items() if k not in all_special_ids], + [v for v in vocab.values() if v not in all_special_ids], k=length, ) @@ -80,7 +79,7 @@ def sample_requests_from_dataset( num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: tuple[int, int], - fixed_output_len: Optional[int], + fixed_output_len: int | None, ) -> list[Request]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -128,7 +127,7 @@ def sample_requests_from_random( num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: tuple[int, int], - fixed_output_len: Optional[int], + fixed_output_len: int | None, prefix_len: int, ) -> list[Request]: requests = [] diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index bb453791c1862bdd0883f0a47af6fddeef764f8f..a35db0063b0ae245f2022af198f80c673c700512 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -7,12 +7,11 @@ import dataclasses import json import random import time -from typing import Optional from transformers import AutoTokenizer, PreTrainedTokenizerBase from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # Select a equi-probable random priority @@ -24,7 +23,7 @@ def sample_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], + fixed_output_len: int | None, ) -> list[tuple[str, int, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index a0350625491f49a5cf2ec6c6e5f128944de504fe..55001cf3722a0b4d4d835f4632d07687c4c3f162 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -31,20 +31,19 @@ import time import uuid import warnings from collections.abc import AsyncGenerator +from contextlib import nullcontext from dataclasses import dataclass -from typing import Optional import datasets import numpy as np import pandas as pd -from tqdm.asyncio import tqdm -from transformers import PreTrainedTokenizerBase - from backend_request_func import ( ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput, ) +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase try: from vllm.transformers_utils.tokenizer import get_tokenizer @@ -52,7 +51,7 @@ except ImportError: from backend_request_func import get_tokenizer try: - from vllm.utils import FlexibleArgumentParser + from vllm.utils.argparse_utils import FlexibleArgumentParser except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser @@ -317,7 +316,7 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: list[str], selected_percentiles: list[float], - goodput_config_dict: Optional[dict[str, float]] = None, + goodput_config_dict: dict[str, float] | None = None, ) -> tuple[BenchmarkMetrics, list[int]]: actual_output_lens: list[int] = [] total_input = 0 @@ -437,9 +436,9 @@ async def benchmark( selected_percentile_metrics: list[str], selected_percentiles: list[str], ignore_eos: bool, - max_concurrency: Optional[int], + max_concurrency: int | None, structured_output_ratio: float, - goodput_config_dict: Optional[dict[str, float]] = None, + goodput_config_dict: dict[str, float] | None = None, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -503,15 +502,9 @@ async def benchmark( pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - # This can be used once the minimum Python version is 3.10 or higher, - # and it will simplify the code in limited_request_func. - # semaphore = (asyncio.Semaphore(max_concurrency) - # if max_concurrency else contextlib.nullcontext()) - semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext() async def limited_request_func(request_func_input, pbar): - if semaphore is None: - return await request_func(request_func_input=request_func_input, pbar=pbar) async with semaphore: return await request_func(request_func_input=request_func_input, pbar=pbar) @@ -910,13 +903,13 @@ def create_argument_parser(): parser.add_argument( "--tokenizer", type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", ) parser.add_argument( "--tokenizer-mode", type=str, default="auto", - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", ) parser.add_argument( "--num-prompts", diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 98624abdf49fb0dbf75a5c3f58a5fe95469e9b51..f0d661f9d53498f5067bfb0ef7d19d52af84699d 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -6,7 +6,7 @@ import math import os import time from types import TracebackType -from typing import Any, Optional, Union +from typing import Any def convert_to_pytorch_benchmark_format( @@ -92,7 +92,7 @@ class TimeCollector: def __init__(self, scale: int) -> None: self.cnt: int = 0 self._sum: int = 0 - self._max: Optional[int] = None + self._max: int | None = None self.scale = scale self.start_time: int = time.monotonic_ns() @@ -104,13 +104,13 @@ class TimeCollector: else: self._max = max(self._max, v) - def avg(self) -> Union[float, str]: + def avg(self) -> float | str: return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" - def max(self) -> Union[float, str]: + def max(self) -> float | str: return self._max / self.scale if self._max else "N/A" - def dump_avg_max(self) -> list[Union[float, str]]: + def dump_avg_max(self) -> list[float | str]: return [self.avg(), self.max()] def __enter__(self) -> None: @@ -118,8 +118,8 @@ class TimeCollector: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ) -> None: self.collect(time.monotonic_ns() - self.start_time) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 9ec270bbd2e988b459f99d876215984bf125d0f9..67fccdf4fd07e3aaa0feb29c64c114f7fdbb738f 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -6,8 +6,7 @@ import copy import itertools import pickle as pkl import time -from collections.abc import Iterable -from typing import Callable +from collections.abc import Callable, Iterable import torch import torch.utils.benchmark as TBenchmark @@ -16,7 +15,7 @@ from utils import make_rand_sparse_tensors from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index a5a5b52f603977cc0c967e455724bc7c6a2bf585..f7325ddd2cbbfef35bf1ffeb7e9ebb678c7e355b 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -6,8 +6,7 @@ import copy import itertools import pickle as pkl import time -from collections.abc import Iterable -from typing import Callable, Optional +from collections.abc import Callable, Iterable import torch import torch.utils.benchmark as TBenchmark @@ -17,9 +16,10 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - w8a8_block_fp8_matmul, + w8a8_triton_block_scaled_mm, ) -from vllm.utils import FlexibleArgumentParser, cdiv +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.math_utils import cdiv DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -53,7 +53,7 @@ def bench_int8( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: """Benchmark INT8-based kernels.""" assert dtype == torch.int8 @@ -108,7 +108,7 @@ def bench_fp8( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn @@ -158,7 +158,7 @@ def bench_fp8( "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) ), - "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( + "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm( a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) ), "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( @@ -183,7 +183,7 @@ def bench( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: if dtype == torch.int8: return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) @@ -201,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]): def run( dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]], - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 2c72941cf7e5112724683a89585276e1bae6e9c0..d683835db96a4d0f720e3d86560694ee6af9b828 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -55,9 +55,7 @@ benchmark() { output_len=$2 - CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=0 vllm serve $model \ --port 8100 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ @@ -65,9 +63,7 @@ benchmark() { '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=1 vllm serve $model \ --port 8200 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index 0bbf7cd2b1c81e5d3d6641b705918a11a3feead6..35c86cc845221ae24395f89394c764d78eed5329 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -38,16 +38,12 @@ wait_for_server() { launch_chunked_prefill() { model="meta-llama/Meta-Llama-3.1-8B-Instruct" # disagg prefill - CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=0 vllm serve $model \ --port 8100 \ --max-model-len 10000 \ --enable-chunked-prefill \ --gpu-memory-utilization 0.6 & - CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=1 vllm serve $model \ --port 8200 \ --max-model-len 10000 \ --enable-chunked-prefill \ @@ -62,18 +58,14 @@ launch_chunked_prefill() { launch_disagg_prefill() { model="meta-llama/Meta-Llama-3.1-8B-Instruct" # disagg prefill - CUDA_VISIBLE_DEVICES=0 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=0 vllm serve $model \ --port 8100 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & - CUDA_VISIBLE_DEVICES=1 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ + CUDA_VISIBLE_DEVICES=1 vllm serve $model \ --port 8200 \ --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 901524214469e8677cbb58c3078e8b1d724585bb..d809bf1db8cbca52a915051cec26a2d1e24d026a 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -3,10 +3,9 @@ import pickle as pkl import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]: def unfused_int8_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): # Norm @@ -68,7 +67,7 @@ def unfused_int8_impl( def unfused_fp8_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): # Norm @@ -85,7 +84,7 @@ def unfused_fp8_impl( def fused_impl( rms_norm_layer: RMSNorm, # this stores the weights x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): out, _ = ops.rms_norm_dynamic_per_token_quant( diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/bench_block_fp8_gemm.py index f1e504499eaf612de382709911f966b0de782f84..11e3ac7f0c1fa6ab4e576ee872a79a0129adc13f 100644 --- a/benchmarks/kernels/bench_block_fp8_gemm.py +++ b/benchmarks/kernels/bench_block_fp8_gemm.py @@ -1,10 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +# Disable DeepGEMM for this benchmark to use CUTLASS +os.environ["VLLM_USE_DEEP_GEMM"] = "0" + import torch from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_w8a8_block_fp8_linear, + W8A8BlockFp8LinearOp, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, ) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED, @@ -39,13 +47,14 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_max, fp8_min = fp8_info.max, fp8_info.min - # Create random FP8 tensors + # Create random input tensor (bfloat16, will be quantized by W8A8BlockFp8LinearOp) A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max + # Create quantized weight tensor B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - # Create scales + # Create weight scales block_n, block_k = block_size[0], block_size[1] n_tiles = (N + block_n - 1) // block_n k_tiles = (K + block_k - 1) // block_k @@ -55,19 +64,25 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): * factor_for_scale ) - # SM90 CUTLASS requires row-major format for scales - if use_cutlass and current_platform.is_device_capability(90): - Bs = Bs.T.contiguous() + # Create W8A8BlockFp8LinearOp instance + weight_group_shape = GroupShape(block_n, block_k) + act_quant_group_shape = GroupShape(1, block_k) # Per-token, per-group quantization + + linear_op = W8A8BlockFp8LinearOp( + weight_group_shape=weight_group_shape, + act_quant_group_shape=act_quant_group_shape, + cutlass_block_fp8_supported=use_cutlass, + use_aiter_and_is_supported=False, + ) def run(): - if use_cutlass: - return apply_w8a8_block_fp8_linear( - A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True - ) - else: - return apply_w8a8_block_fp8_linear( - A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False - ) + return linear_op.apply( + input=A_ref, + weight=B, + weight_scale=Bs, + input_scale=None, + bias=None, + ) return run diff --git a/benchmarks/kernels/bench_mxfp4_qutlass.py b/benchmarks/kernels/bench_mxfp4_qutlass.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc7721876a1739055f851f86468ce01fb3fd670 --- /dev/null +++ b/benchmarks/kernels/bench_mxfp4_qutlass.py @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at). +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import copy +import itertools + +import torch +from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn +from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "mxfp4": dict(no_a_quant=False, enabled=True), + "mxfp4-noquant": dict(no_a_quant=True, enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device): + return ( + deterministic_hadamard_matrix(group_size, dtype=dtype, device=device) + * group_size**-0.5 + ) + + +def _quant_weight_mxfp4( + b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str +): + weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx( + b, forward_hadamard_matrix, method="abs_max" + ) + weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton") + return weight_hf_e2m1, weight_hf_scale_block + + +def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device): + weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4( + b, forward_hadamard_matrix, device + ) + alpha = torch.tensor([1.0], device="cuda") + + if cfg["no_a_quant"]: + # Pre-quantize activation + input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx( + a, forward_hadamard_matrix, method="abs_max" + ) + input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton") + + def run(): + return matmul_mxf4_bf16_tn( + input_hf_e2m1, + weight_hf_e2m1, + input_hf_scale_block, + weight_hf_scale_block, + alpha, + ) + + return run + + # Quantize activation on-the-fly + def run(): + input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx( + a, forward_hadamard_matrix, method="abs_max" + ) + input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton") + return matmul_mxf4_bf16_tn( + input_hf_e2m1, + weight_hf_e2m1, + input_hf_scale_block, + weight_hf_scale_block, + alpha, + ) + + return run + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[ + 1, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + 24576, + 32768, + ], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs MXFP4 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K, had_size): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_mxfp4_runner( + cfg, a, b, forward_hadamard_matrix, dtype, device + ) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), rep=200, quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.3-70B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + for had_size in [32, 64, 128]: + print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_mxfp4_res_n{N}_k{K}", + N=N, + K=K, + had_size=had_size, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_nvfp4_qutlass.py b/benchmarks/kernels/bench_nvfp4_qutlass.py new file mode 100644 index 0000000000000000000000000000000000000000..6fecc816f9466ce3d2cf7e0f1780da0ea50cf39d --- /dev/null +++ b/benchmarks/kernels/bench_nvfp4_qutlass.py @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at). +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import copy +import itertools + +import torch +from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm +from vllm._custom_ops import fusedQuantizeNv +from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked +from vllm.triton_utils import triton + +PROVIDER_CFGS = { + "torch-bf16": dict(enabled=True), + "nvfp4": dict(no_a_quant=False, enabled=True), + "nvfp4-noquant": dict(no_a_quant=True, enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device): + return ( + deterministic_hadamard_matrix(group_size, dtype=dtype, device=device) + * group_size**-0.5 + ) + + +def _quant_weight_nvfp4( + b: torch.Tensor, + forward_hadamard_matrix: torch.Tensor, + global_scale: torch.Tensor, + device: str, + M: int, + N: int, + K: int, +): + weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv( + b, forward_hadamard_matrix, global_scale + ) + weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view( + -1, K // 16 + ) + return weight_hf_e2m1, weight_hf_scale_block + + +def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K): + alpha = torch.tensor([1.0], device="cuda") + global_scale = torch.tensor([1.0], device="cuda") + weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4( + b, forward_hadamard_matrix, global_scale, device, M, N, K + ) + + if cfg["no_a_quant"]: + # Pre-quantize activation + input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv( + a, forward_hadamard_matrix, global_scale + ) + input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view( + -1, K // 16 + ) + + def run(): + return ops.cutlass_scaled_fp4_mm( + input_hf_e2m1, + weight_hf_e2m1, + input_hf_scale_block, + weight_hf_scale_block, + alpha, + torch.bfloat16, + ) + + return run + + # Quantize activation on-the-fly + def run(): + input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv( + a, forward_hadamard_matrix, global_scale + ) + input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view( + -1, K // 16 + ) + return ops.cutlass_scaled_fp4_mm( + input_hf_e2m1, + weight_hf_e2m1, + input_hf_scale_block, + weight_hf_scale_block, + alpha, + torch.bfloat16, + ) + + return run + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[ + 1, + 4, + 8, + 16, + 32, + 64, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 8192, + 16384, + 24576, + 32768, + ], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs NVFP4 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K, had_size): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device) + + quantiles = [0.5, 0.2, 0.8] + + if provider == "torch-bf16": + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles + ) + else: + cfg = PROVIDER_CFGS[provider] + run_quant = build_nvfp4_runner( + cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K + ) + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), rep=200, quantiles=quantiles + ) + + to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) + return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.3-70B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + args = parser.parse_args() + + for K, N, model in prepare_shapes(args): + for had_size in [16, 32, 64, 128]: + print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_nvfp4_res_n{N}_k{K}", + N=N, + K=K, + had_size=had_size, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py index e08e5680c191ed844d5262c48e72a4ca534472dc..7792cfd03b0e49022d7365b9a96e8e41ad236a99 100644 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ b/benchmarks/kernels/bench_per_token_quant_fp8.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Callable +from collections.abc import Callable from unittest.mock import patch import pandas as pd @@ -10,7 +10,8 @@ import torch from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.triton_utils import triton -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE def with_triton_mode(fn): diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py index 93edbcc9391fcf696e7333d7945919c8248cfb75..66268b71b3de644b424429aa66ad7dc6216b3c3a 100644 --- a/benchmarks/kernels/benchmark_activation.py +++ b/benchmarks/kernels/benchmark_activation.py @@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation # noqa F401 from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform from vllm.triton_utils import triton -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE batch_size_range = [1, 16, 32, 64, 128] seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index 66b44c27d6ee8cdea4a7081c8142b0e037b8fd0c..6bcb1798379576be0e49c7b374da05b5dfc87405 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -28,7 +28,7 @@ except ImportError as e: from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser parser = FlexibleArgumentParser( description="Benchmark BitBLAS int4 on a specific target." diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index 726a2a371d1099d7c260af675703e22c8d4b1f21..7982cbb1422c5e4dda0f8aff07a3bad91f85ad0a 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser WEIGHT_SHAPES_MOE = { "nvidia/DeepSeek-R1-FP4": [ diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index b419b2fa0e3eb84b4b8a340a3721731f343cdde6..027f67ad4db694896b30cd13fa877d2ce30a95a7 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # Weight shapes for different models: [num_experts, topk, hidden_size, # intermediate_size] diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index 4cbdde5a5b2cab30db5bda1beec1e1c240335326..b414efa6e330bca5f1f5e5604c60fa357d9803bc 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -22,8 +22,8 @@ Example: import json import os import time +from collections.abc import Callable from contextlib import nullcontext -from typing import Callable, Optional import torch import torch.distributed as dist @@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import ( ) from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator from vllm.logger import init_logger -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) @@ -264,12 +264,12 @@ class CommunicatorBenchmark: def benchmark_allreduce_single( self, sequence_length: int, - allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]], + allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None], should_use_fn: Callable[[torch.Tensor], bool], context, num_warmup: int, num_trials: int, - ) -> Optional[float]: + ) -> float | None: """Benchmark method with CUDA graph optimization.""" try: # Create test tensor (2D: sequence_length x hidden_size) diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py new file mode 100644 index 0000000000000000000000000000000000000000..38e7fdcf55426f1e50424a2715e1f8323dcff729 --- /dev/null +++ b/benchmarks/kernels/benchmark_fused_collective.py @@ -0,0 +1,1129 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Benchmark for FlashInfer fused collective operations vs standard operations. + +This benchmark compares: +1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant) +2. Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations + +Usage with torchrun: + torchrun --nproc_per_node=2 benchmark_fused_collective.py + +""" + +import argparse +import itertools +import os +import time + +import pandas as pd +import torch # type: ignore +import torch.distributed as dist # type: ignore + +from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config +from vllm.distributed import ( + get_tp_group, + tensor_model_parallel_all_reduce, +) +from vllm.distributed.parallel_state import ( + graph_capture, + init_distributed_environment, + initialize_model_parallel, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.layernorm import RMSNorm # noqa +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 # noqa +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape # noqa +from vllm.platforms import current_platform # noqa + +RMS_NORM_OP = torch.ops._C.rms_norm +FUSED_ADD_RMS_NORM_OP = torch.ops._C.fused_add_rms_norm +RMS_NORM_STATIC_FP8_QUANT_OP = torch.ops._C.rms_norm_static_fp8_quant +FUSED_ADD_RMS_NORM_STATIC_FP8_QUANT_OP = ( + torch.ops._C.fused_add_rms_norm_static_fp8_quant +) +SCALED_FP4_QUANT_OP = torch.ops._C.scaled_fp4_quant + +logger = init_logger(__name__) + +# Try to import FlashInfer +try: + import flashinfer.comm as flashinfer_comm # type: ignore + + if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"): + flashinfer_comm = None + logger.warning( + "FlashInfer comm module found but missing trtllm_allreduce_fusion" + ) +except ImportError: + flashinfer_comm = None + logger.warning("FlashInfer not found, only benchmarking standard operations") + +# Constants +FP8_DTYPE = current_platform.fp8_dtype() +MiB = 1024 * 1024 + +# FlashInfer max sizes per world size +# Enable 64MB for 2, 4, 8 world sizes to verify large input sizes +# use --disable-oneshot to disable oneshot mode for very large input sizes +_FI_MAX_SIZES = { + 2: 64 * MiB, # 64MB + 4: 64 * MiB, # 64MB + 8: 64 * MiB, # 64MB +} + +# Global workspace tensor for FlashInfer +_FI_WORKSPACE_TENSOR = None + + +def setup_flashinfer_workspace( + world_size: int, + rank: int, + hidden_dim: int, + max_token_num: int, + use_fp32_lamport: bool = False, +): + """Setup FlashInfer workspace for fused allreduce operations.""" + global _FI_WORKSPACE_TENSOR + + if flashinfer_comm is None: + return None, None + + if world_size not in _FI_MAX_SIZES: + logger.warning("FlashInfer not supported for world size %s", world_size) + return None, None + + try: + # Create IPC workspace + ipc_handles, workspace_tensor = ( + flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion( + tp_rank=rank, + tp_size=world_size, + max_token_num=max_token_num, + hidden_dim=hidden_dim, + group=get_tp_group().device_group, + use_fp32_lamport=use_fp32_lamport, + ) + ) + + _FI_WORKSPACE_TENSOR = workspace_tensor + return ipc_handles, workspace_tensor + except Exception as e: + logger.error("Failed to setup FlashInfer workspace: %s", e) + return None, None + + +def cleanup_flashinfer_workspace(ipc_handles): + """Cleanup FlashInfer workspace.""" + if flashinfer_comm is None or ipc_handles is None: + return + + try: + group = get_tp_group().device_group + flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group) + except Exception as e: + logger.error("Failed to cleanup FlashInfer workspace: %s", e) + + +class FlashInferFusedAllReduceParams: + """Parameters for FlashInfer fused allreduce operations.""" + + def __init__( + self, + rank: int, + world_size: int, + use_fp32_lamport: bool = False, + max_token_num: int = 1024, + ): + self.rank = rank + self.world_size = world_size + self.use_fp32_lamport = use_fp32_lamport + self.trigger_completion_at_end = True + self.launch_with_pdl = True + self.fp32_acc = True + self.max_token_num = max_token_num + + def get_trtllm_fused_allreduce_kwargs(self): + return { + "world_rank": self.rank, + "world_size": self.world_size, + "launch_with_pdl": self.launch_with_pdl, + "trigger_completion_at_end": self.trigger_completion_at_end, + "fp32_acc": self.fp32_acc, + } + + +def flashinfer_fused_allreduce_rmsnorm( + input_tensor: torch.Tensor, + residual: torch.Tensor | None, + rms_gamma: torch.Tensor, + rms_eps: float, + allreduce_params: "FlashInferFusedAllReduceParams", + use_oneshot: bool, + norm_out: torch.Tensor | None = None, +): + """FlashInfer fused allreduce + rmsnorm operation.""" + if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + raise RuntimeError("FlashInfer not available or workspace not initialized") + + if norm_out is None: + norm_out = input_tensor + residual_out = residual + else: + residual_out = input_tensor + + flashinfer_comm.trtllm_allreduce_fusion( + allreduce_in=input_tensor, + token_num=input_tensor.shape[0], + residual_in=residual, + residual_out=residual_out, + norm_out=norm_out, + rms_gamma=rms_gamma, + rms_eps=rms_eps, + hidden_dim=input_tensor.shape[-1], + workspace_ptrs=_FI_WORKSPACE_TENSOR, + pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm, + allreduce_out=None, + quant_out=None, + scale_out=None, + layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, + scale_factor=None, + use_oneshot=use_oneshot, + **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + ) + + +def flashinfer_fused_allreduce_rmsnorm_fp8_quant( + input_tensor: torch.Tensor, + residual: torch.Tensor | None, + rms_gamma: torch.Tensor, + rms_eps: float, + scale_factor: torch.Tensor, + allreduce_params: FlashInferFusedAllReduceParams, + use_oneshot: bool = True, + norm_out: torch.Tensor | None = None, + quant_out: torch.Tensor | None = None, +): + """FlashInfer fused allreduce + rmsnorm + FP8 quantization.""" + if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + raise RuntimeError("FlashInfer not available or workspace not initialized") + + if norm_out is None: + norm_out = input_tensor + residual_out = residual + else: + residual_out = input_tensor + + flashinfer_comm.trtllm_allreduce_fusion( + allreduce_in=input_tensor, + token_num=input_tensor.shape[0], + residual_in=residual, + residual_out=residual_out, + norm_out=norm_out, + rms_gamma=rms_gamma, + rms_eps=rms_eps, + hidden_dim=input_tensor.shape[-1], + workspace_ptrs=_FI_WORKSPACE_TENSOR, + pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, + allreduce_out=None, + quant_out=quant_out, + scale_out=None, + layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, + scale_factor=scale_factor, + use_oneshot=use_oneshot, + **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + ) + + +def flashinfer_fused_allreduce_rmsnorm_fp4_quant( + input_tensor: torch.Tensor, + residual: torch.Tensor | None, + rms_gamma: torch.Tensor, + rms_eps: float, + input_global_scale: torch.Tensor, + allreduce_params: FlashInferFusedAllReduceParams, + quant_out: torch.Tensor, + use_oneshot: bool, + output_scale: torch.Tensor, + norm_out: torch.Tensor | None = None, +): + """FlashInfer fused allreduce + rmsnorm + FP4 quantization.""" + if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + raise RuntimeError("FlashInfer not available or workspace not initialized") + + if norm_out is None: + norm_out = input_tensor + residual_out = residual + else: + residual_out = input_tensor + + flashinfer_comm.trtllm_allreduce_fusion( + allreduce_in=input_tensor, + token_num=input_tensor.shape[0], + residual_in=residual, + residual_out=residual_out, + norm_out=norm_out, + rms_gamma=rms_gamma, + rms_eps=rms_eps, + hidden_dim=input_tensor.shape[-1], + workspace_ptrs=_FI_WORKSPACE_TENSOR, + pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, + allreduce_out=None, + quant_out=quant_out, + scale_out=output_scale, + layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, + scale_factor=input_global_scale, + use_oneshot=use_oneshot, + **allreduce_params.get_trtllm_fused_allreduce_kwargs(), + ) + + +class VllmFusedAllreduce: + def __init__(self, hidden_dim, dtype): + self.rms_eps = 1e-6 + self.rms_norm = RMSNorm(hidden_dim, eps=self.rms_eps, dtype=dtype) + self.fp8_quant = QuantFP8( + static=True, + group_shape=GroupShape.PER_TENSOR, + ) + + def allreduce_rmsnorm( + self, input_tensor: torch.Tensor, residual: torch.Tensor | None + ): + allreduce_out = tensor_model_parallel_all_reduce(input_tensor) + return self.rms_norm(allreduce_out, residual) + + def allreduce_rmsnorm_fp8_quant( + self, + input_tensor: torch.Tensor, + residual: torch.Tensor | None, + scale_factor: torch.Tensor, + ): + allreduce_out = tensor_model_parallel_all_reduce(input_tensor) + rms_out = self.rms_norm(allreduce_out, residual) + if residual is None: + quant_out = self.fp8_quant(rms_out, scale_factor) + return quant_out + else: + rms_out, residual_out = rms_out + quant_out = self.fp8_quant(rms_out, scale_factor) + return quant_out, residual_out + + def allreduce_rmsnorm_fp4_quant( + self, + input_tensor: torch.Tensor, + residual: torch.Tensor | None, + input_global_scale: torch.Tensor, + quant_out: torch.Tensor, + output_scale: torch.Tensor, + ): + allreduce_out = tensor_model_parallel_all_reduce(input_tensor) + rms_out = self.rms_norm(allreduce_out, residual) + if residual is None: + SCALED_FP4_QUANT_OP(quant_out, rms_out, output_scale, input_global_scale) + return quant_out, output_scale + else: + rms_out, residual_out = rms_out + SCALED_FP4_QUANT_OP(quant_out, rms_out, output_scale, input_global_scale) + return quant_out, residual_out, output_scale + + +def create_test_tensors( + num_tokens: int, hidden_dim: int, dtype: torch.dtype, use_residual: bool = True +): + """Create test tensors for benchmarking.""" + input_tensor = torch.randn(num_tokens, hidden_dim, dtype=dtype) + residual = ( + torch.randn_like(input_tensor) + if use_residual + else torch.zeros_like(input_tensor) + ) + rms_gamma = torch.ones(hidden_dim, dtype=dtype) + norm_out = None if use_residual else torch.empty_like(input_tensor) + + # Quantization scales + scale_fp8 = torch.tensor(1.0, dtype=torch.float32) + scale_fp4 = torch.tensor(1.0, dtype=torch.float32) + quant_out_fp8 = torch.empty_like(input_tensor, dtype=FP8_DTYPE) + # Pre-allocate FP4 output tensors (to avoid allocation overhead in benchmarks) + fp4_quant_out = torch.empty((num_tokens, hidden_dim // 2), dtype=torch.uint8) + fp4_output_scale = torch.empty((128, 4), dtype=torch.int32) + + return ( + input_tensor, + norm_out, + residual, + rms_gamma, + scale_fp8, + quant_out_fp8, + scale_fp4, + fp4_quant_out, + fp4_output_scale, + ) + + +def benchmark_operation( + operation_func, *args, warmup: int = 5, trials: int = 20, **kwargs +): + """Benchmark a single operation using CUDA graphs.""" + # Warmup before graph capture + for _ in range(warmup): + operation_func(*args, **kwargs) + torch.cuda.synchronize() + + # Create CUDA graph + graph = torch.cuda.CUDAGraph() + num_op_per_cudagraph = 10 + + # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe + device = torch.device(f"cuda:{torch.cuda.current_device()}") + with graph_capture(device=device), torch.cuda.graph(graph): + for _ in range(num_op_per_cudagraph): + operation_func(*args, **kwargs) + + # Graph warmup + torch.cuda.synchronize() + for _ in range(warmup): + graph.replay() + + # Benchmark with CUDA graph + torch.cuda.synchronize() + start_time = time.perf_counter() + + for _ in range(trials // num_op_per_cudagraph): + # operation_func(*args, **kwargs) + graph.replay() + + torch.cuda.synchronize() + end_time = time.perf_counter() + + avg_time_ms = ((end_time - start_time) / trials) * 1000 + return avg_time_ms + + +def run_benchmarks( + num_tokens: int, + hidden_dim: int, + dtype: torch.dtype, + use_residual: bool, + allreduce_params: FlashInferFusedAllReduceParams | None, + quant_modes: set[str], + no_oneshot: bool, +): + """Run all benchmarks for given configuration. + + Args: + quant_mode: "none", "fp8_only", "fp4_only", or "all" + """ + ( + input_tensor, + norm_out, + residual, + rms_gamma, + scale_fp8, + quant_out_fp8, + scale_fp4, + fp4_quant_out, + fp4_output_scale, + ) = create_test_tensors(num_tokens, hidden_dim, dtype, use_residual) + + rms_eps = 1e-6 + results = {} + vllm_fused_allreduce = VllmFusedAllreduce(hidden_dim, dtype) + use_oneshot_options = [False] if no_oneshot else [True, False] + + # Create RMSNorm and QuantFP8 layers once for native benchmarks + + if "none" in quant_modes: + # Standard AllReduce + RMSNorm + for custom_op in ["-rms_norm", "+rms_norm"]: + with set_current_vllm_config( + VllmConfig(compilation_config=CompilationConfig(custom_ops=[custom_op])) + ): + try: + suffix = ( + "_custom_rms_norm" if "+" in custom_op else "_native_rms_norm" + ) + time_ms = benchmark_operation( + vllm_fused_allreduce.allreduce_rmsnorm, + input_tensor, + residual=residual, + ) + results[f"standard_allreduce_{suffix}"] = time_ms + except Exception as e: + logger.error("Standard AllReduce+RMSNorm failed: %s", e) + results[f"standard_allreduce_{suffix}"] = float("inf") + + # Standard AllReduce + RMSNorm Native Compiled + with set_current_vllm_config( + VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"])) + ): + try: + standard_allreduce_rmsnorm_native_compiled = torch.compile( + vllm_fused_allreduce.allreduce_rmsnorm, + fullgraph=True, + dynamic=False, + ) + time_ms = benchmark_operation( + standard_allreduce_rmsnorm_native_compiled, + input_tensor, + residual=residual, + ) + results["standard_allreduce_rmsnorm_native_compiled"] = time_ms + except Exception as e: + logger.error("Standard AllReduce+RMSNorm Native Compiled failed: %s", e) + results["standard_allreduce_rmsnorm_native_compiled"] = float("inf") + + # FlashInfer Fused AllReduce + RMSNorm Oneshot/Twoshot + if flashinfer_comm is not None and allreduce_params is not None: + for use_oneshot in use_oneshot_options: + suffix = "_oneshot" if use_oneshot else "_twoshot" + try: + time_ms = benchmark_operation( + flashinfer_fused_allreduce_rmsnorm, + input_tensor, + residual=residual, + norm_out=norm_out, + rms_gamma=rms_gamma, + rms_eps=rms_eps, + allreduce_params=allreduce_params, + use_oneshot=use_oneshot, + ) + results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = time_ms + except Exception as e: + logger.error("FlashInfer Fused AllReduce+RMSNorm failed: %s", e) + results[f"flashinfer_fused_allreduce_rmsnorm{suffix}"] = float( + "inf" + ) + + if "fp8" in quant_modes: + # Standard AllReduce + RMSNorm + FP8 Quant + for rms_norm_custom_op in ["-rms_norm", "+rms_norm"]: + suffix = ( + "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm" + ) + for quant_fp8_custom_op in ["-quant_fp8", "+quant_fp8"]: + suffix += ( + "_custom_quant_fp8" + if "+" in quant_fp8_custom_op + else "_native_quant_fp8" + ) + with set_current_vllm_config( + VllmConfig( + compilation_config=CompilationConfig( + custom_ops=[rms_norm_custom_op, quant_fp8_custom_op] + ) + ) + ): + try: + time_ms = benchmark_operation( + vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant, + input_tensor, + residual=residual, + scale_factor=scale_fp8, + ) + results[f"standard_allreduce{suffix}"] = time_ms + except Exception as e: + logger.error("Standard AllReduce+RMSNorm+FP8 failed: %s", e) + results[f"standard_allreduce{suffix}"] = float("inf") + + # Standard AllReduce + RMSNorm + FP8 Quant Native Compiled + with set_current_vllm_config( + VllmConfig( + compilation_config=CompilationConfig( + custom_ops=["-rms_norm", "-quant_fp8"] + ) + ) + ): + try: + standard_allreduce_rmsnorm_fp8_quant_native_compiled = torch.compile( + vllm_fused_allreduce.allreduce_rmsnorm_fp8_quant, + fullgraph=True, + dynamic=False, + ) + time_ms = benchmark_operation( + standard_allreduce_rmsnorm_fp8_quant_native_compiled, + input_tensor, + residual=residual, + scale_factor=scale_fp8, + ) + results["standard_allreduce_rmsnorm_fp8_quant_native_compiled"] = ( + time_ms + ) + except Exception as e: + logger.error( + "Standard AllReduce+RMSNorm+FP8 Native Compiled failed: %s", e + ) + results["standard_allreduce_rmsnorm_fp8_quant_native_compiled"] = float( + "inf" + ) + + # FlashInfer Fused AllReduce + RMSNorm + FP8 Quant Oneshot + if flashinfer_comm is not None and allreduce_params is not None: + for use_oneshot in use_oneshot_options: + suffix = "_oneshot" if use_oneshot else "_twoshot" + try: + time_ms = benchmark_operation( + flashinfer_fused_allreduce_rmsnorm_fp8_quant, + input_tensor, + norm_out=norm_out, + residual=residual, + rms_gamma=rms_gamma, + rms_eps=rms_eps, + scale_factor=scale_fp8, + quant_out=quant_out_fp8, + allreduce_params=allreduce_params, + use_oneshot=use_oneshot, + ) + results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = ( + time_ms + ) + except Exception as e: + logger.error( + "FlashInfer Fused AllReduce+RMSNorm+FP8 Oneshot failed: %s", + e, + ) + results[f"flashinfer_fused_allreduce_rmsnorm_fp8_quant{suffix}"] = ( + float("inf") + ) + + if "fp4" in quant_modes and current_platform.has_device_capability(100): + # Standard AllReduce + RMSNorm + FP4 Quant + for rms_norm_custom_op in ["-rms_norm", "+rms_norm"]: + suffix = ( + "_custom_rms_norm" if "+" in rms_norm_custom_op else "_native_rms_norm" + ) + with set_current_vllm_config( + VllmConfig( + compilation_config=CompilationConfig( + custom_ops=[rms_norm_custom_op] + ) + ) + ): + try: + time_ms = benchmark_operation( + vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant, + input_tensor, + residual=residual, + input_global_scale=scale_fp4, + quant_out=fp4_quant_out, + output_scale=fp4_output_scale, + ) + results[f"standard_allreduce_{suffix}_fp4_quant"] = time_ms + except Exception as e: + logger.error("Standard AllReduce+RMSNorm+FP4 failed: %s", e) + results[f"standard_allreduce_{suffix}_fp4_quant"] = float("inf") + + # Standard AllReduce + RMSNorm + FP4 Quant Native Compiled + with set_current_vllm_config( + VllmConfig(compilation_config=CompilationConfig(custom_ops=["-rms_norm"])) + ): + try: + standard_allreduce_rmsnorm_fp4_quant_native_compiled = torch.compile( + vllm_fused_allreduce.allreduce_rmsnorm_fp4_quant, + fullgraph=True, + dynamic=False, + ) + time_ms = benchmark_operation( + standard_allreduce_rmsnorm_fp4_quant_native_compiled, + input_tensor, + residual=residual, + quant_out=fp4_quant_out, + input_global_scale=scale_fp4, + output_scale=fp4_output_scale, + ) + results["standard_allreduce_rmsnorm_fp4_quant_native_compiled"] = ( + time_ms + ) + except Exception as e: + logger.error( + "Standard AllReduce+RMSNorm+FP4 Native Compiled failed: %s", e + ) + results["standard_allreduce_rmsnorm_fp4_quant_native_compiled"] = float( + "inf" + ) + + # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Oneshot + if flashinfer_comm is not None and allreduce_params is not None: + for use_oneshot in use_oneshot_options: + suffix = "_oneshot" if use_oneshot else "_twoshot" + try: + time_ms = benchmark_operation( + flashinfer_fused_allreduce_rmsnorm_fp4_quant, + input_tensor, + residual=residual, + norm_out=norm_out, + rms_gamma=rms_gamma, + rms_eps=rms_eps, + input_global_scale=scale_fp4, + allreduce_params=allreduce_params, + quant_out=fp4_quant_out, + output_scale=fp4_output_scale, + use_oneshot=use_oneshot, + ) + results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = ( + time_ms + ) + except Exception as e: + logger.error( + "FlashInfer Fused AllReduce+RMSNorm+FP4 Oneshot failed: %s", + e, + ) + results[f"flashinfer_fused_allreduce_rmsnorm_fp4_quant{suffix}"] = ( + float("inf") + ) + + # FlashInfer Fused AllReduce + RMSNorm + FP4 Quant Two-shot + if flashinfer_comm is not None and allreduce_params is not None: + try: + time_ms = benchmark_operation( + flashinfer_fused_allreduce_rmsnorm_fp4_quant, + input_tensor, + residual=residual, + norm_out=norm_out, + rms_gamma=rms_gamma, + rms_eps=rms_eps, + input_global_scale=scale_fp4, + allreduce_params=allreduce_params, + quant_out=fp4_quant_out, + output_scale=fp4_output_scale, + use_oneshot=False, + ) + results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = ( + time_ms + ) + except Exception as e: + logger.error( + "FlashInfer Fused AllReduce+RMSNorm+FP4 Two-shot failed: %s", + e, + ) + results["flashinfer_fused_allreduce_rmsnorm_fp4_quant_twoshot"] = float( + "inf" + ) + + return results + + +def prepare_results_with_speedups(results_dict): + """Prepare results with speedup calculations based on dynamic baseline selection.""" + prepared_results = [] + + # Determine the fastest baseline for each operation type + def get_fastest_baseline(op_name, results_dict): + """Get the fastest baseline between standard and native_compiled versions.""" + if "fp8_quant" in op_name: + candidates = [ + "standard_allreduce_rmsnorm_fp8_quant", + "standard_allreduce_rmsnorm_fp8_quant_native_compiled", + ] + elif "fp4_quant" in op_name: + candidates = [ + "standard_allreduce_rmsnorm_fp4_quant", + "standard_allreduce_rmsnorm_fp4_quant_native_compiled", + ] + else: + candidates = [ + "standard_allreduce_rmsnorm", + "standard_allreduce_rmsnorm_native_compiled", + ] + + # Find the fastest among available candidates + fastest_time = float("inf") + fastest_baseline = None + + for candidate in candidates: + if ( + candidate in results_dict + and results_dict[candidate] != float("inf") + and results_dict[candidate] < fastest_time + ): + fastest_time = results_dict[candidate] + fastest_baseline = candidate + + return fastest_baseline + + # Create dynamic baseline mapping + dynamic_baseline_mapping = {} + for op_name in results_dict: + if ( + op_name.startswith("flashinfer_") + or op_name.startswith("standard_") + and not op_name.endswith("_native_compiled") + ): + dynamic_baseline_mapping[op_name] = get_fastest_baseline( + op_name, results_dict + ) + + for op_name, time_ms in results_dict.items(): + if time_ms == float("inf"): + speedup_str = "FAILED" + time_str = "FAILED" + else: + time_str = f"{time_ms:.3f}" + # Find the appropriate baseline for this operation + baseline_op = dynamic_baseline_mapping.get(op_name) + if baseline_op and baseline_op in results_dict: + baseline_time = results_dict[baseline_op] + if baseline_time != float("inf") and baseline_time > 0: + speedup = baseline_time / time_ms + speedup_str = f"{speedup:.2f}x" + else: + speedup_str = "N/A" + else: + # For baseline operations, determine if this is the fastest baseline + if op_name.endswith("_native_compiled") or ( + op_name.startswith("standard_") + and not op_name.endswith("_native_compiled") + ): + fastest_baseline = get_fastest_baseline(op_name, results_dict) + if fastest_baseline == op_name: + speedup_str = "baseline" + else: + if fastest_baseline and fastest_baseline in results_dict: + baseline_time = results_dict[fastest_baseline] + if baseline_time != float("inf") and baseline_time > 0: + speedup = baseline_time / time_ms + speedup_str = f"{speedup:.2f}x" + else: + speedup_str = "N/A" + else: + speedup_str = "N/A" + else: + speedup_str = "N/A" + + prepared_results.append( + { + "operation": op_name, + "time_ms": time_ms, + "time_str": time_str, + "speedup_str": speedup_str, + } + ) + + return prepared_results + + +def print_results( + results_dict, + num_tokens, + hidden_dim, + dtype, + use_residual, + quant_modes, + input_size_mb, +): + """Print benchmark results in a formatted table.""" + print(f"\n{'=' * 80}") + print( + f"Results: num_tokens={num_tokens}, hidden_dim={hidden_dim} " + f"(input size: {input_size_mb:.2f} MB)" + ) + print( + f"dtype={dtype}, residual={'yes' if use_residual else 'no'}, " + f"quant_modes={','.join(sorted(list(quant_modes)))}" + ) + print(f"{'=' * 80}") + print(f"{'Operation':<50} {'Time (ms)':<12} {'Speedup':<10}") + print(f"{'-' * 80}") + + # Prepare results with speedup calculations + prepared_results = prepare_results_with_speedups(results_dict) + + for result in prepared_results: + if result["time_ms"] == float("inf"): + time_display = result["time_str"] + else: + time_display = f"{result['time_ms']:.3f}" + + print( + f"{result['operation']:<50} {time_display:<12} {result['speedup_str']:<10}" + ) + + +def format_results_markdown( + all_results: list[dict], world_size: int, args: argparse.Namespace +) -> str: + """Format all benchmark results as markdown.""" + lines: list[str] = [] + lines.append("# FlashInfer Fused Collective Operations Benchmark Results") + lines.append("") + lines.append(f"**World Size:** {world_size} ") + lines.append(f"**Hidden Dimension:** {args.hidden_dim} ") + lines.append(f"**Warmup Iterations:** {args.warmup} ") + lines.append(f"**Benchmark Trials:** {args.trials} ") + modes = ",".join(all_results[0]["quant_modes"]) if all_results else "N/A" + lines.append(f"**Quantization Modes:** {modes} ") + lines.append("") + lines.append("---") + lines.append("") + + for entry in all_results: + num_tokens = entry["num_tokens"] + dtype = entry["dtype"] + use_residual = entry["use_residual"] + results_dict = entry["results"] + input_size_mb = entry["input_size_mb"] + residual_str = "with residual" if use_residual else "no residual" + + lines.append( + f"## Configuration: num_tokens={num_tokens}, dtype={dtype}, {residual_str}" + ) + lines.append(f"**Input Size:** {input_size_mb:.2f} MB") + lines.append("") + + prepared = prepare_results_with_speedups(results_dict) + # Build DataFrame for markdown export + rows = [ + { + "Operation": r["operation"].replace("_", " ").title(), + "Time (ms)": r["time_str"], + "Speedup": r["speedup_str"], + } + for r in prepared + ] + df = pd.DataFrame(rows) + if df.empty: + lines.append("No results.") + else: + lines.append(df.to_markdown(index=False)) + lines.append("") + + return "\n".join(lines) + + +def save_results_to_file( + all_results: list[dict], world_size: int, args: argparse.Namespace, rank: int +): + """Save benchmark results to markdown file (only on rank 0).""" + if rank != 0: + return + + if not all_results: + logger.warning("No results to save") + return + + output_path = args.output_file + + try: + markdown_content = format_results_markdown(all_results, world_size, args) + + with open(output_path, "a") as f: + f.write(markdown_content) + + except Exception as e: + logger.error("Failed to save results to file: %s", e) + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark fused collective operations" + ) + parser.add_argument( + "--num-tokens", + type=int, + nargs="+", + default=[128, 512, 1024, 2048], + help="Numbers of tokens to test", + ) + parser.add_argument( + "--hidden-dim", type=int, default=8192, help="Hidden dimension size" + ) + parser.add_argument( + "--dtypes", + type=str, + nargs="+", + default=["bfloat16"], + choices=["float16", "bfloat16", "float32"], + help="Data types to test", + ) + parser.add_argument( + "--no-residual", + action="store_true", + help="Skip residual connection tests", + ) + + parser.add_argument( + "--quant-modes", + type=str, + default="none,fp8,fp4", + help=( + "Comma-separated quantization modes to run: none, fp8, fp4. " + "Default: none,fp8,fp4" + ), + ) + + parser.add_argument( + "--warmup", type=int, default=5, help="Number of warmup iterations" + ) + parser.add_argument( + "--trials", type=int, default=20, help="Number of benchmark trials" + ) + parser.add_argument( + "--output-file", + type=str, + help="""Output file path for markdown results + (default: benchmark_results_.md) + """, + ) + + parser.add_argument( + "--no-oneshot", + action="store_true", + help="Skip oneshot benchmarks", + ) + + args = parser.parse_args() + + # Check if running with torchrun (required for collective operations) + if "RANK" not in os.environ or "WORLD_SIZE" not in os.environ: + raise RuntimeError( + "Must run with torchrun for distributed benchmarking. " + "Example: torchrun --nproc_per_node=2 benchmark_fused_collective.py" + ) + + # Initialize distributed environment + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # Validate world size (must be > 1 for collective operations) + if world_size <= 1: + raise ValueError( + "World size must be > 1 for collective operations benchmarking. " + f"Current world size: {world_size}. Use torchrun with --nproc_per_node > 1." + ) + + # Parse quantization modes + valid_quant_modes = {"none", "fp8", "fp4"} + raw_modes = [ + m.strip().lower() for m in (args.quant_modes or "").split(",") if m.strip() + ] + quant_modes = set(raw_modes) if raw_modes else {"none", "fp8", "fp4"} + invalid = sorted(list(quant_modes - valid_quant_modes)) + if invalid: + raise ValueError( + f"Invalid --quant-modes entries: {','.join(invalid)}. " + f"Valid options are: {','.join(sorted(valid_quant_modes))}." + ) + + if rank == 0: + logger.info("Running benchmark with world_size=%s, rank=%s", world_size, rank) + logger.info("Quantization modes: %s", ",".join(sorted(list(quant_modes)))) + if flashinfer_comm is not None: + logger.info( + "FlashInfer available - will benchmark fused operations", + ) + else: + logger.info( + "FlashInfer not available - only benchmarking standard operations" + ) + + # Convert dtype strings to torch dtypes + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + dtypes = [dtype_map[dt] for dt in args.dtypes] + + # Test configurations + residual_options = [True] if not args.no_residual else [False] + + configs = list(itertools.product(args.num_tokens, dtypes, residual_options)) + + # Setup FlashInfer workspace if available + ipc_handles = None + allreduce_params = None + + if flashinfer_comm is not None: + # Use the largest hidden dimension for workspace setup + max_num_token = _FI_MAX_SIZES.get(world_size) // ( + args.hidden_dim * world_size * 2 + ) + + ipc_handles, workspace_tensor = setup_flashinfer_workspace( + world_size, rank, args.hidden_dim, max_num_token + ) + + if workspace_tensor is not None: + allreduce_params = FlashInferFusedAllReduceParams( + rank=rank, + world_size=world_size, + max_token_num=max_num_token, + ) + + # Collect all results for markdown export + all_results = [] + + try: + # Run benchmarks + for num_tokens, dtype, use_residual in configs: + if rank == 0: + logger.info( + "\nTesting: num_tokens=%s, hidden_dim=%s, dtype=%s, residual=%s", + num_tokens, + args.hidden_dim, + dtype, + use_residual, + ) + + results = run_benchmarks( + num_tokens, + args.hidden_dim, + dtype, + use_residual, + allreduce_params, + quant_modes=quant_modes, + no_oneshot=args.no_oneshot, + ) + + # Store results for markdown export + if rank == 0: + # Calculate input size in MB + input_size_mb = ( + num_tokens * args.hidden_dim * torch.finfo(dtype).bits + ) / (8 * 1024 * 1024) + all_results.append( + { + "num_tokens": num_tokens, + "hidden_dim": args.hidden_dim, + "dtype": str(dtype).replace("torch.", ""), + "use_residual": use_residual, + "quant_modes": sorted(list(quant_modes)), + "input_size_mb": input_size_mb, + "results": results, + } + ) + + print_results( + results, + num_tokens, + args.hidden_dim, + dtype, + use_residual, + quant_modes, + input_size_mb, + ) + + # Save results to markdown file + if args.output_file and rank == 0: + save_results_to_file(all_results, world_size, args, rank) + + finally: + # Cleanup + if ipc_handles is not None: + cleanup_flashinfer_workspace(ipc_handles) + + dist.barrier() + + +if __name__ == "__main__": + main() diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 14330ae6f03c55bb8bddb42689740b5c72bb324e..9b426d8d5f778a483aab12fa82d451460b940274 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -13,11 +13,11 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, fused_topk, ) -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = [ - "nm-testing/Mixtral-8x7B-Instruct-v0.1", - "nm-testing/deepseekv2-lite", + "mistralai/Mixtral-8x7B-Instruct-v0.1", + "deepseek-ai/DeepSeek-V2-Lite", "ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m", ] diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 69978ec6b23e94ce21d8f3dfb633317b3803e6ef..6fa5c248670e32cdd672e45e016d3c47d9689577 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -7,7 +7,8 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE @torch.inference_mode() diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 799b16999873f41ef39d3a302874602201e332e6..6715c9b548aa1adabb372fab1e4d562b48c6e112 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -6,11 +6,12 @@ import copy import json import pickle import time +from collections.abc import Callable from dataclasses import dataclass from enum import Enum, auto from itertools import product from pathlib import Path -from typing import Any, Callable, Optional +from typing import Any import torch import torch.utils.benchmark as TBenchmark @@ -18,13 +19,24 @@ from torch.utils.benchmark import Measurement as TMeasurement from utils import ArgPool, Bench, CudaGraphBenchParams from weight_shapes import WEIGHT_SHAPES -from vllm.triton_utils import HAS_TRITON +from vllm.lora.ops.triton_ops.utils import get_lora_op_configs +from vllm.triton_utils import HAS_TRITON, triton if HAS_TRITON: - from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink + from vllm.lora.ops.triton_ops import ( ## added fused_moe_lora + LoRAKernelMeta, + fused_moe_lora_expand, + fused_moe_lora_shrink, + lora_expand, + lora_shrink, + ) + from vllm.lora.ops.triton_ops.fused_moe_lora_op import ( + _LORA_PTR_DICT, ## added _LORA_PTR_DICT for fused_moe_lora + ) from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT - -from vllm.utils import FlexibleArgumentParser +from vllm import _custom_ops as ops +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.math_utils import round_up DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_TP_SIZES = [1] @@ -58,6 +70,8 @@ DEFAULT_NUM_LORAS = [1, 2, 3, 4] DEFAULT_SORT_BY_LORA_IDS = [False, True] DEFAULT_SEQ_LENGTHS = [1] DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] +DEFAULT_TOP_K_NUMS = [1] # Added for MoE LoRA top_k +DEFAULT_NUM_EXPERTS = [8] # Added for MoE LoRA num_experts # Utilities @@ -158,7 +172,7 @@ def ref_group_gemm( seq_lens_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor, scaling: float, - add_inputs: Optional[bool], + add_inputs: bool | None, ): """ Torch group gemm reference implementation to test correctness of @@ -190,6 +204,11 @@ class OpType(Enum): LORA_SHRINK = auto() LORA_EXPAND = auto() + ## Adding support for fused moe lora + FUSED_MOE_LORA_GATE_UP_SHRINK = auto() ## Gate/Up projection variant with shrink + FUSED_MOE_LORA_GATE_UP_EXPAND = auto() ## Gate/Up projection variant with expand + FUSED_MOE_LORA_DOWN_SHRINK = auto() ## Down projection variant with shrink + FUSED_MOE_LORA_DOWN_EXPAND = auto() ## Down projection variant with expand @staticmethod def from_str(s: str) -> "OpType": @@ -197,6 +216,15 @@ class OpType(Enum): return OpType.LORA_SHRINK if s.lower() == "lora_expand": return OpType.LORA_EXPAND + # Adding support for fused moe lora, both in gate_up and down + if s.lower() == "fused_moe_lora_gate_up_shrink": ## Gate/Up variant with shrink + return OpType.FUSED_MOE_LORA_GATE_UP_SHRINK + if s.lower() == "fused_moe_lora_gate_up_expand": ## Gate/Up variant with expand + return OpType.FUSED_MOE_LORA_GATE_UP_EXPAND + if s.lower() == "fused_moe_lora_down_shrink": ## Down variant with shrink + return OpType.FUSED_MOE_LORA_DOWN_SHRINK + if s.lower() == "fused_moe_lora_down_expand": ## Down variant with expand + return OpType.FUSED_MOE_LORA_DOWN_EXPAND raise ValueError(f"Unrecognized str {s} to convert to OpType") def is_shrink_fn(self) -> bool: @@ -205,19 +233,56 @@ class OpType(Enum): def is_expand_fn(self) -> bool: return self in [OpType.LORA_EXPAND] + def is_fused_moe_lora_fn(self) -> bool: ## adding for fused MoE LoRA + return self in [ + OpType.FUSED_MOE_LORA_GATE_UP_SHRINK, + OpType.FUSED_MOE_LORA_DOWN_SHRINK, + OpType.FUSED_MOE_LORA_GATE_UP_EXPAND, + OpType.FUSED_MOE_LORA_DOWN_EXPAND, + ] + + def is_fused_moe_lora_gate_up_fn( + self, + ) -> bool: ## adding for fused MoE LoRA Gate/Up + return self in [ + OpType.FUSED_MOE_LORA_GATE_UP_SHRINK, + OpType.FUSED_MOE_LORA_GATE_UP_EXPAND, + ] + + def is_fused_moe_lora_down_fn(self) -> bool: ## adding for fused MoE LoRA Down + return self in [ + OpType.FUSED_MOE_LORA_DOWN_SHRINK, + OpType.FUSED_MOE_LORA_DOWN_EXPAND, + ] + + def is_fused_moe_lora_shrink_fn(self) -> bool: + return self in [ + OpType.FUSED_MOE_LORA_GATE_UP_SHRINK, + OpType.FUSED_MOE_LORA_DOWN_SHRINK, + ] + + def is_fused_moe_lora_expand_fn(self) -> bool: + return self in [ + OpType.FUSED_MOE_LORA_GATE_UP_EXPAND, + OpType.FUSED_MOE_LORA_DOWN_EXPAND, + ] + def num_slices(self) -> list[int]: + if self.is_fused_moe_lora_gate_up_fn(): + return [2] + elif self.is_fused_moe_lora_down_fn(): + return [1] return [1, 2, 3] def mkn( self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int ) -> tuple[int, int, int]: num_tokens = batch_size * seq_length - if self.is_shrink_fn(): + if self.is_shrink_fn() or self.is_fused_moe_lora_fn(): m = num_tokens k = hidden_size n = lora_rank - else: - assert self.is_expand_fn() + elif self.is_expand_fn(): m = num_tokens k = lora_rank n = hidden_size @@ -231,9 +296,36 @@ class OpType(Enum): """ if self.is_shrink_fn(): return op_dtype, op_dtype, torch.float32 - else: - assert self.is_expand_fn() + elif self.is_expand_fn(): return torch.float32, op_dtype, op_dtype + else: + assert self.is_fused_moe_lora_fn() + return op_dtype, op_dtype, op_dtype + + def matmul_shapes_fused_moe_lora( + self, + m: int, + n: int, + k: int, + num_loras: int, + num_slices: int, + top_k_num: int, + num_experts: int, + ) -> tuple[tuple[int], tuple[int], tuple[int], tuple[int]]: + if self.is_fused_moe_lora_shrink_fn(): + input_shape = ( + (m * top_k_num, n) + if self in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] + else (m, n) + ) + output_shape = (num_slices, m, top_k_num, k) + weight_shape = (num_loras, num_experts, k, n) + else: + assert self.is_fused_moe_lora_expand_fn() + input_shape = (num_slices, m, top_k_num, k) + output_shape = (m, top_k_num, n * num_slices) + weight_shape = (num_loras, num_experts, n, k) + return (input_shape, weight_shape, output_shape) def matmul_shapes( self, @@ -243,6 +335,8 @@ class OpType(Enum): lora_rank: int, num_loras: int, num_slices: int, + top_k_num: int | None = None, + num_experts: int | None = None, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: """ Given num_slices, return the shapes of the A, B, and C matrices @@ -257,6 +351,16 @@ class OpType(Enum): if self in [OpType.LORA_EXPAND]: # LoRA expand kernels support num_slices inherently in the kernel return ((num_slices, m, k), b_shape, (m, n * num_slices)) + if self.is_fused_moe_lora_fn(): + return self.matmul_shapes_fused_moe_lora( + m, + k, + n, + num_loras, + num_slices, + top_k_num, + num_experts, + ) raise ValueError(f"Unrecognized op_type {self}") def bench_fn(self) -> Callable: @@ -264,6 +368,16 @@ class OpType(Enum): return lora_shrink if self == OpType.LORA_EXPAND: return lora_expand + if self in [ + OpType.FUSED_MOE_LORA_GATE_UP_SHRINK, + OpType.FUSED_MOE_LORA_DOWN_SHRINK, + ]: + return fused_moe_lora_shrink + if self in [ + OpType.FUSED_MOE_LORA_GATE_UP_EXPAND, + OpType.FUSED_MOE_LORA_DOWN_EXPAND, + ]: + return fused_moe_lora_expand raise ValueError(f"Unrecognized optype {self}") @@ -316,8 +430,10 @@ class BenchmarkContext: lora_rank: int sort_by_lora_id: bool dtype: torch.dtype - seq_length: Optional[int] = None - num_slices: Optional[int] = None # num_slices for slice based ops + seq_length: int | None = None + num_experts: int | None = None # num_experts for MoE based ops + top_k_num: int | None = None # top_k for MoE based ops + num_slices: int | None = None # num_slices for slice based ops def with_seq_length(self, seq_length: int) -> "BenchmarkContext": ctx = copy.copy(self) @@ -372,6 +488,11 @@ class BenchmarkTensors: f"{dtype_to_str(self.output.dtype)}" ) + def get_num_tokens(self, size: int, top_k_num: int, op_type: OpType): + return ( + size * top_k_num if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] else size + ) + @staticmethod def make( ctx: BenchmarkContext, op_type: OpType, device: str = "cuda" @@ -384,6 +505,8 @@ class BenchmarkTensors: ctx.lora_rank, ctx.num_loras, ctx.num_slices, + ctx.top_k_num, + ctx.num_experts, ) a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) input_tensor, lora_weights, output_tensor = make_rand_tensors( @@ -431,17 +554,27 @@ class BenchmarkTensors: prompt_lora_indices_tensor, ) - def sanity_check(self) -> None: + def sanity_check(self, ctx: BenchmarkContext, op_type: OpType) -> None: """ Fails asserts when non-conformality is detected. """ - num_tokens = self.input.shape[-2] + num_tokens = ( + self.input.shape[1] + if op_type.is_fused_moe_lora_expand_fn() + else self.input.shape[-2] + ) # check metadata tensors - assert torch.sum(self.seq_lens) == num_tokens + ## In down shrink case, each token is repeated top_k_num times + assert num_tokens == self.get_num_tokens( + torch.sum(self.seq_lens), ctx.top_k_num, op_type + ), f"Expected {num_tokens} tokens, but got {torch.sum(self.seq_lens)}" num_seqs = self.seq_lens.shape[0] # assert self.seq_start_loc.shape[0] == num_seqs + ## In down shrink case, each prompt corresponds to top_k_num sequences assert self.prompt_lora_mapping.shape[0] == num_seqs - assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens + assert self.get_num_tokens( + self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type + ) def to_device(self, device: str): """ @@ -470,21 +603,111 @@ class BenchmarkTensors: to_device(field) if field_name != "no_lora_flag_cpu" else field, ) - def metadata(self) -> tuple[int, int, int]: + def metadata(self, ctx: BenchmarkContext, op_type: OpType) -> tuple[int, int, int]: """ Return num_seqs, num_tokens and max_seq_len """ num_seqs = self.seq_lens.shape[0] - num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0] + num_tokens = self.get_num_tokens( + self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type + ) max_seq_len = torch.max(self.seq_lens).item() num_slices = len(self.lora_weights_lst) return num_seqs, num_tokens, max_seq_len, num_slices - def as_lora_shrink_kwargs(self) -> dict[str, Any]: - self.sanity_check() + def fused_moe_lora_data_prepare( + self, + block_size: int, + token_lora_mapping: torch.Tensor, + ctx: BenchmarkContext, + ): + def moe_lora_align_block_size( + topk_ids: torch.Tensor, + token_lora_mapping: torch.Tensor, + block_size: int, + num_experts: int, + max_loras: int, + expert_map: torch.Tensor | None = None, + pad_sorted_ids: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Aligns tokens and experts into block-sized chunks for LoRA-based + mixture-of-experts (MoE) execution. + """ + max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) + if pad_sorted_ids: + max_num_tokens_padded = round_up(max_num_tokens_padded, block_size) + sorted_ids = torch.empty( + (max_loras * max_num_tokens_padded,), + dtype=torch.int32, + device=topk_ids.device, + ) + max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size) + # Expert ids must be set default to -1 to prevent a blank block + expert_ids = torch.empty( + (max_loras * max_num_m_blocks,), + dtype=torch.int32, + device=topk_ids.device, + ) + num_tokens_post_pad = torch.empty( + (max_loras), dtype=torch.int32, device=topk_ids.device + ) + + ops.moe_lora_align_block_size( + topk_ids, + token_lora_mapping, + num_experts, + block_size, + max_loras, + max_num_tokens_padded, + max_num_m_blocks, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) + if expert_map is not None: + expert_ids = expert_map[expert_ids] + + return sorted_ids, expert_ids, num_tokens_post_pad + + num_tokens = ctx.batch_size + curr_topk_ids = torch.randint( + 0, + ctx.num_experts, + (num_tokens, ctx.top_k_num), + device="cuda", + dtype=torch.int32, + ) + topk_weights = torch.randint( + 0, + ctx.num_experts, + (num_tokens, ctx.top_k_num), + device="cuda", + dtype=torch.int32, + ) + + (sorted_token_ids_lora, expert_ids_lora, num_tokens_post_padded_lora) = ( + moe_lora_align_block_size( + topk_ids=curr_topk_ids, + token_lora_mapping=token_lora_mapping, + block_size=block_size, + num_experts=ctx.num_experts, + max_loras=ctx.num_loras, + ) + ) + + sorted_token_ids = sorted_token_ids_lora.view(ctx.num_loras, -1) + expert_ids = expert_ids_lora.view(ctx.num_loras, -1) + num_tokens_post_padded = num_tokens_post_padded_lora + return (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) + + def as_lora_shrink_kwargs( + self, ctx: BenchmarkContext, op_type: OpType + ) -> dict[str, Any]: + self.sanity_check(ctx, op_type) self.to_device(self.input.device) - _, num_tokens, _, num_slices = self.metadata() + _, num_tokens, _, num_slices = self.metadata(ctx, op_type) # Sanity check matrix shapes. i_shape, lw_shape, o_shape = ( @@ -519,11 +742,13 @@ class BenchmarkTensors: "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } - def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: - self.sanity_check() + def as_lora_expand_kwargs( + self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool + ) -> dict[str, Any]: + self.sanity_check(ctx, op_type) self.to_device(self.input.device) - _, num_tokens, _, num_slices = self.metadata() + _, num_tokens, _, num_slices = self.metadata(ctx, op_type) # Sanity check matrix shapes. i_shape, lw_shape, o_shape = ( @@ -560,22 +785,177 @@ class BenchmarkTensors: "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } + def as_fused_moe_lora_shrink_kwargs( + self, ctx: BenchmarkContext, op_type: OpType + ) -> dict[str, Any]: + self.sanity_check(ctx, op_type) + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata(ctx, op_type) + + # Sanity check matrix shapes. + i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) + # Expected input shape : [num_tokens, hidden_size] for gate_up + # Expected input shape : [top_k_num * num_tokens, hidden_size] for down + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [max_lora, num_experts, lora_rank, hidden_size] + assert len(lw_shape) == 4 + assert lw_shape[-1] == hidden_size + lora_rank = lw_shape[-2] + # Expected output shape : [num_slices, num_tokens, top_k_num, lora_rank] + assert len(o_shape) == 4 + assert ( + o_shape + == (num_slices, num_tokens // ctx.top_k_num, ctx.top_k_num, lora_rank) + if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] + else o_shape == (num_slices, num_tokens, ctx.top_k_num, lora_rank) + ) + kernel_config = get_lora_op_configs( + op_type.name.lower(), + max_loras=lw_shape[0], + batch=num_tokens, + hidden_size=hidden_size, + rank=lora_rank, + num_slices=num_slices, + add_inputs=False, + ) + + (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = ( + self.fused_moe_lora_data_prepare( + block_size=kernel_config["BLOCK_SIZE_M"], + token_lora_mapping=self.lora_kernel_meta.token_lora_mapping, + ctx=ctx, + ) + ) + + return { + "qcurr_hidden_states": self.input, + "lora_a_stacked": self.lora_weights_lst, + "a_intermediate_cache1": self.output, + "topk_weights": topk_weights, + "sorted_token_ids": sorted_token_ids, + "expert_ids": expert_ids, + "num_tokens_post_padded": num_tokens_post_padded, + "top_k_num": ctx.top_k_num, + "device": self.input.device, + "N": lora_rank, + "M": topk_weights.shape[0], + "EM": sorted_token_ids.shape[1], + "K": self.input.shape[1], + "num_tokens": num_tokens, + "num_experts": ctx.num_experts, + "num_slices": num_slices, + "shrink_block_size_m": kernel_config["BLOCK_SIZE_M"], + "shrink_block_size_n": kernel_config["BLOCK_SIZE_N"], + "shrink_block_size_k": kernel_config["BLOCK_SIZE_K"], + "shrink_group_size_m": kernel_config["GROUP_SIZE_M"], + "shrink_num_warps": kernel_config["NUM_WARPS"], + "shrink_num_stages": kernel_config["NUM_STAGES"], + "shrink_split_k": kernel_config.get("SPLIT_K", 1), + "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(), + } + + def as_fused_moe_lora_expand_kwargs( + self, ctx: BenchmarkContext, op_type: OpType + ) -> dict[str, Any]: + self.sanity_check(ctx, op_type) + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata(ctx, op_type) + + # Sanity check matrix shapes. + i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) + + # Expected input shape : [num_slices, num_tokens, top_k_num, lora_rank] + assert len(i_shape) == 4 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[-1] + # Expected lora weight shape : [num_loras, num_experts, hidden_size, lora_rank] + assert len(lw_shape) == 4 + assert lw_shape[-1] == lora_rank + hidden_size = lw_shape[-2] + # Expected output shape : [num_tokens, top_k_num, hidden_size * num_slices] + assert len(o_shape) == 3 + assert o_shape == (num_tokens, ctx.top_k_num, hidden_size * num_slices) + + kernel_config = get_lora_op_configs( + op_type.name.lower(), + max_loras=lw_shape[0], + batch=num_tokens, + hidden_size=hidden_size, + rank=lora_rank, + num_slices=num_slices, + add_inputs=False, + ) + + (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = ( + self.fused_moe_lora_data_prepare( + block_size=kernel_config["BLOCK_SIZE_M"], + token_lora_mapping=self.lora_kernel_meta.token_lora_mapping, + ctx=ctx, + ) + ) + + return { + "a_intermediate_cache1": self.input, + "lora_b_stacked": self.lora_weights_lst, + "output": self.output, + "topk_weights": topk_weights, + "sorted_token_ids": sorted_token_ids, + "expert_ids": expert_ids, + "num_tokens_post_padded": num_tokens_post_padded, + "top_k_num": ctx.top_k_num, + "device": self.input.device, + "N": lora_rank, + "M": topk_weights.shape[0], + "EM": sorted_token_ids.shape[1], + "K": self.input.shape[1], + "num_tokens": num_tokens, + "num_experts": ctx.num_experts, + "num_slices": num_slices, + "max_lora_rank": lora_rank, + "w1_output_dim_size": lw_shape[2], + "expand_block_size_m": kernel_config["BLOCK_SIZE_M"], + "expand_block_size_n": kernel_config["BLOCK_SIZE_N"], + "expand_block_size_k": kernel_config["BLOCK_SIZE_K"], + "expand_group_size_m": kernel_config["GROUP_SIZE_M"], + "expand_num_warps": kernel_config["NUM_WARPS"], + "expand_num_stages": kernel_config["NUM_STAGES"], + "expand_split_k": kernel_config.get("SPLIT_K", 1), + "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(), + } + def bench_fn_kwargs( - self, op_type: OpType, add_inputs: Optional[bool] = None + self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool | None = None ) -> dict[str, Any]: - if op_type.is_shrink_fn(): + if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn(): assert add_inputs is None else: assert add_inputs is not None if op_type == OpType.LORA_SHRINK: - return self.as_lora_shrink_kwargs() + return self.as_lora_shrink_kwargs(ctx, op_type) if op_type == OpType.LORA_EXPAND: - return self.as_lora_expand_kwargs(add_inputs) + return self.as_lora_expand_kwargs(ctx, op_type, add_inputs) + if op_type.is_fused_moe_lora_shrink_fn(): + return self.as_fused_moe_lora_shrink_kwargs(ctx, op_type) + if op_type.is_fused_moe_lora_expand_fn(): + return self.as_fused_moe_lora_expand_kwargs(ctx, op_type) raise ValueError(f"Unrecognized optype {self}") def test_correctness( - self, op_type: OpType, expand_fn_add_inputs: Optional[bool] + self, op_type: OpType, expand_fn_add_inputs: bool | None ) -> bool: """ Test correctness of op_type implementation against a grouped gemm @@ -611,12 +991,12 @@ def bench_optype( ctx: BenchmarkContext, arg_pool_size: int, op_type: OpType, - cuda_graph_nops: Optional[int] = None, - expand_fn_add_inputs: Optional[bool] = None, + cuda_graph_nops: int | None = None, + expand_fn_add_inputs: bool | None = None, test_correctness: bool = False, ) -> TMeasurement: assert arg_pool_size >= 1 - if op_type.is_shrink_fn(): + if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn(): assert expand_fn_add_inputs is None else: assert expand_fn_add_inputs is not None @@ -626,23 +1006,30 @@ def bench_optype( BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size) ] for bt in bench_tensors: - bt.sanity_check() + bt.sanity_check(ctx, op_type) # Test correctness of our implementation. if test_correctness: + assert op_type in [OpType.LORA_SHRINK, OpType.LORA_EXPAND], ( + f"Correctness testing is not supported for {op_type.name}." + ) assert all( - [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors] + [ + bt.test_correctness(ctx, op_type, expand_fn_add_inputs) + for bt in bench_tensors + ] ) # BenchmarkTensors -> dict (kwargs) kwargs_list = [ - bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) + bt.bench_fn_kwargs(ctx, op_type, add_inputs=expand_fn_add_inputs) for bt in bench_tensors ] # Clear LoRA optimization hash-maps. _LORA_A_PTR_DICT.clear() _LORA_B_PTR_DICT.clear() + _LORA_PTR_DICT.clear() # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up for kwargs in kwargs_list: op_type.bench_fn()(**kwargs) @@ -679,7 +1066,7 @@ def bench_torch_mm( ctx: BenchmarkContext, arg_pool_size: int, op_type: OpType, - cuda_graph_nops: Optional[int] = None, + cuda_graph_nops: int | None = None, ) -> TMeasurement: """ Benchmark basic torch.mm as a roofline. @@ -744,7 +1131,7 @@ def use_cuda_graph_recommendation() -> str: """ -def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): +def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None): compare = TBenchmark.Compare(timers) compare.print() @@ -792,7 +1179,9 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): # Benchmark bench_op expand_fn_add_inputs = ( - [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + [None] + if bench_op.is_shrink_fn() or bench_op.is_fused_moe_lora_fn() + else args.expand_fn_add_inputs ) for add_input_arg in expand_fn_add_inputs: seq_len_timers.append( @@ -830,12 +1219,22 @@ def as_benchmark_contexts( hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace ) -> list[BenchmarkContext]: ctxs: list[BenchmarkContext] = [] - for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa + for ( + batch_size, + hidden_size, + lora_rank, + num_loras, + sort_by_lora_id, + top_k_num, + num_experts, + ) in product( # noqa args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, args.sort_by_lora_id, + args.top_k_nums, + args.num_experts, ): ctxs.append( BenchmarkContext( @@ -850,6 +1249,8 @@ def as_benchmark_contexts( seq_length=None, sort_by_lora_id=sort_by_lora_id, dtype=args.dtype, + top_k_num=top_k_num, + num_experts=num_experts, # To be filled based on the OpType to benchmark num_slices=None, ) @@ -1011,6 +1412,22 @@ if __name__ == "__main__": ), ) + p.add_argument( + "--top-k-nums", + nargs="+", + type=int, + default=DEFAULT_TOP_K_NUMS, + help="Top-K values for MoE LoRA operations", + ) + + p.add_argument( + "--num-experts", + nargs="+", + type=int, + default=DEFAULT_NUM_EXPERTS, + help="Number of experts for MoE LoRA operations", + ) + parser = FlexibleArgumentParser( description=f""" Benchmark LoRA kernels: diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 1b1c3b321cce44d61ec6fb3e0f97f22950956978..8787724d77cfb989ef0d401f5dcca58486e02cbe 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -8,10 +8,9 @@ import math import os import pickle as pkl import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Optional import pandas as pd import torch @@ -34,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( quantize_weights, ) from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] @@ -63,23 +62,23 @@ class BenchmarkTensors: a: torch.Tensor w_q: torch.Tensor - group_size: Optional[int] + group_size: int | None wtype: ScalarType w_g_s: torch.Tensor - w_g_zp: Optional[torch.Tensor] - w_ch_s: Optional[torch.Tensor] - w_tok_s: Optional[torch.Tensor] + w_g_zp: torch.Tensor | None + w_ch_s: torch.Tensor | None + w_tok_s: torch.Tensor | None @dataclass class TypeConfig: act_type: torch.dtype weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - group_zero_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] + output_type: torch.dtype | None + group_scale_type: torch.dtype | None + group_zero_type: torch.dtype | None + channel_scale_type: torch.dtype | None + token_scale_type: torch.dtype | None def rand_data(shape, dtype=torch.float16, scale=1): @@ -93,8 +92,8 @@ def quantize_and_pack( atype: torch.dtype, w: torch.Tensor, wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], + stype: torch.dtype | None, + group_size: int | None, zero_points: bool = False, ): assert wtype.is_integer(), "TODO: support floating point weights" @@ -113,7 +112,7 @@ def quantize_and_pack( def create_bench_tensors( - shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] + shape: tuple[int, int, int], types: TypeConfig, group_size: int | None ) -> list[BenchmarkTensors]: m, n, k = shape @@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]) return res -_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None -_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None +_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None +_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None def bench( diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 34cc45e94d76d930013c9d60ca8c5f78dd0b7254..12ca9214b1f9583fb22df68b9fc70f7872a58f23 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( sort_weights, ) from vllm.scalar_type import ScalarType, scalar_types -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 02c2db674d4b07f1a8871d1572959e5aac01940c..c99951aa278262e8c4aa1fabf1df2541e6c500cc 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() @@ -211,7 +211,7 @@ def get_rocm_tuning_space(use_fp16): num_warps_range = [1, 2, 4, 8] group_m_range = [1, 4, 8, 16, 32] num_stage_range = [2] - waves_per_eu_range = [0] + waves_per_eu_range = [0, 1, 2, 4] matrix_instr_nonkdim_range = [16, 32] if use_fp16 else [] kpack_range = [1, 2] if use_fp16 else [] @@ -579,19 +579,23 @@ def main(args: argparse.Namespace): E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k intermediate_size = config.ffn_config.ffn_hidden_size + hidden_size = config.hidden_size elif config.architectures[0] == "JambaForCausalLM": E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size + hidden_size = config.hidden_size elif config.architectures[0] in ( "DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM", "Glm4MoeForCausalLM", + "NemotronHForCausalLM", ): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size + hidden_size = config.hidden_size elif config.architectures[0] in ( "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM", @@ -600,10 +604,23 @@ def main(args: argparse.Namespace): E = config.num_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size + hidden_size = config.hidden_size + elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration": + text_config = config.get_text_config() + E = text_config.num_experts + topk = text_config.num_experts_per_tok + intermediate_size = text_config.moe_intermediate_size + hidden_size = text_config.hidden_size elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): E = config.num_experts topk = config.moe_topk[0] intermediate_size = config.moe_intermediate_size[0] + hidden_size = config.hidden_size + elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]: + E = config.thinker_config.text_config.num_experts + topk = config.thinker_config.text_config.num_experts_per_tok + intermediate_size = config.thinker_config.text_config.moe_intermediate_size + hidden_size = config.thinker_config.text_config.hidden_size else: # Support for llama4 config = config.get_text_config() @@ -611,6 +628,7 @@ def main(args: argparse.Namespace): E = config.num_local_experts topk = config.num_experts_per_tok intermediate_size = config.intermediate_size + hidden_size = config.hidden_size enable_ep = bool(args.enable_expert_parallel) if enable_ep: ensure_divisibility(E, args.tp_size, "Number of experts") @@ -619,8 +637,7 @@ def main(args: argparse.Namespace): else: ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") shard_intermediate_size = 2 * intermediate_size // args.tp_size - hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" block_quant_shape = get_weight_block_size_safety(config) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 04d2205aa3722c5fe056c204f7cc0c28d46d6d6b..efa5a7386027e0f140dc6c79659c61c86c974f4d 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( ) from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() @@ -344,7 +344,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok hidden_size = config.hidden_size - dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" use_customized_permute = args.use_customized_permute diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index b9147361708fd09a8d2da852eab08fbc68aa3bfd..cb848d2bf579e8185f2c63192ae64b57204f2a15 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -39,7 +39,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 7e0376c18ecc79690ca49e3995258c5e749bbac7..46ab2a5fe5e98fa05eead6989d924af102e79771 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -3,16 +3,15 @@ import random import time -from typing import Optional import torch from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import ( +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, - FlexibleArgumentParser, create_kv_caches_with_random, ) @@ -37,7 +36,7 @@ def main( seed: int, do_profile: bool, device: str = "cuda", - kv_cache_dtype: Optional[str] = None, + kv_cache_dtype: str | None = None, ) -> None: current_platform.seed_everything(seed) diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index 1ccb5e08b3d57b0d66ed72061941830640cb32d8..bdc1eb733084e165ae73c9fb5c6b4d5185056fe2 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -3,8 +3,8 @@ import argparse import math +from collections.abc import Callable from contextlib import contextmanager -from typing import Callable from unittest.mock import patch import torch diff --git a/benchmarks/kernels/benchmark_polynorm.py b/benchmarks/kernels/benchmark_polynorm.py deleted file mode 100644 index 9ac8f5e6594e4562981576c41a693d8825f90e77..0000000000000000000000000000000000000000 --- a/benchmarks/kernels/benchmark_polynorm.py +++ /dev/null @@ -1,155 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools - -import torch - -from vllm import _custom_ops as vllm_ops -from vllm.triton_utils import triton - - -def polynorm_naive( - x: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - eps: float = 1e-6, -): - orig_shape = x.shape - x = x.view(-1, x.shape[-1]) - - def norm(x, eps: float): - return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps) - - x = x.float() - return ( - ( - weight[0] * norm(x**3, eps) - + weight[1] * norm(x**2, eps) - + weight[2] * norm(x, eps) - + bias - ) - .to(weight.dtype) - .view(orig_shape) - ) - - -def polynorm_vllm( - x: torch.Tensor, - weight: torch.Tensor, - bias: torch.Tensor, - eps: float = 1e-6, -): - orig_shape = x.shape - x = x.view(-1, x.shape[-1]) - - out = torch.empty_like(x) - vllm_ops.poly_norm(out, x, weight, bias, eps) - output = out - - output = output.view(orig_shape) - return output - - -def calculate_diff(batch_size, seq_len, hidden_dim): - dtype = torch.bfloat16 - x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") - weight = torch.ones(3, dtype=dtype, device="cuda") - bias = torch.ones(1, dtype=dtype, device="cuda") - - output_naive = polynorm_naive(x, weight, bias) - output_vllm = polynorm_vllm(x, weight, bias) - - if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): - print("✅ All implementations match") - else: - print("❌ Implementations differ") - - -batch_size_range = [2**i for i in range(0, 7, 2)] -seq_length_range = [2**i for i in range(6, 11, 1)] -dim_range = [2048, 4096] -configs = list(itertools.product(dim_range, batch_size_range, seq_length_range)) - - -def get_benchmark(): - @triton.testing.perf_report( - triton.testing.Benchmark( - x_names=["dim", "batch_size", "seq_len"], - x_vals=[list(_) for _ in configs], - line_arg="provider", - line_vals=["naive", "vllm"], - line_names=["Naive", "vLLM"], - styles=[("blue", "-"), ("red", "-")], - ylabel="us", - plot_name="polynorm-perf", - args={}, - ) - ) - def benchmark(dim, batch_size, seq_len, provider): - dtype = torch.bfloat16 - hidden_dim = dim * 4 - - x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") - weight = torch.ones(3, dtype=dtype, device="cuda") - bias = torch.ones(1, dtype=dtype, device="cuda") - - quantiles = [0.5, 0.2, 0.8] - - if provider == "naive": - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: polynorm_naive(x, weight, bias), - quantiles=quantiles, - ) - else: - ms, min_ms, max_ms = triton.testing.do_bench( - lambda: polynorm_vllm(x, weight, bias), - quantiles=quantiles, - ) - - return 1000 * ms, 1000 * max_ms, 1000 * min_ms - - return benchmark - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument( - "--batch-size", - type=int, - default=4, - help="Batch size", - ) - parser.add_argument( - "--seq-len", - type=int, - default=128, - help="Sequence length", - ) - parser.add_argument( - "--hidden-dim", - type=int, - default=8192, - help="Intermediate size of MLP", - ) - parser.add_argument( - "--save-path", - type=str, - default="./configs/polnorm/", - help="Path to save polnorm benchmark results", - ) - - args = parser.parse_args() - - # Run correctness test - calculate_diff( - batch_size=args.batch_size, - seq_len=args.seq_len, - hidden_dim=args.hidden_dim, - ) - - benchmark = get_benchmark() - # Run performance benchmark - benchmark.run(print_data=True, save_path=args.save_path) diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 6ab26f5f1adf73ccdc68c0d5f20d622d0c09b7ba..3c2ac9128947af0f33aad790b811bc3f56afae10 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -7,7 +7,8 @@ import torch from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE @torch.inference_mode() diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..0d3aef0c630b284feb8a8b4e3e35cdfa62237f39 --- /dev/null +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random +import time + +import torch +from tabulate import tabulate + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + create_kv_caches_with_random, +) + +logger = init_logger(__name__) + + +@torch.inference_mode() +def run_benchmark( + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + kv_cache_dtype: str, + num_iters: int, + benchmark_mode: str, + device: str = "cuda", +) -> float: + """Return latency (seconds) for given num_tokens.""" + + if kv_cache_dtype == "fp8" and head_size % 16: + raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") + + current_platform.seed_everything(42) + torch.set_default_device(device) + + # create random key / value tensors [T, H, D]. + key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) + value = torch.randn_like(key) + + # prepare the slot mapping. + # each token is assigned a unique slot in the KV-cache. + num_slots = block_size * num_blocks + if num_tokens > num_slots: + raise ValueError("num_tokens cannot exceed the total number of cache slots") + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) + + key_caches, value_caches = create_kv_caches_with_random( + num_blocks, + block_size, + 1, # num_layers + num_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + # to free unused memory + del key_caches, value_caches + + # compute per-kernel scaling factors for fp8 conversion (if used). + k_scale = (key.amax() / 64.0).to(torch.float32) + v_scale = (value.amax() / 64.0).to(torch.float32) + + function_under_test = lambda: ops.reshape_and_cache( + key, # noqa: F821 + value, # noqa: F821 + key_cache, # noqa: F821 + value_cache, # noqa: F821 + slot_mapping, # noqa: F821 + kv_cache_dtype, + k_scale, + v_scale, + ) + + if benchmark_mode == "cudagraph": + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + function_under_test() + torch.cuda.synchronize() + function_under_test = lambda: g.replay() + + def run_cuda_benchmark(n_iters: int) -> float: + nonlocal key, value, key_cache, value_cache, slot_mapping + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(n_iters): + function_under_test() + torch.cuda.synchronize() + end = time.perf_counter() + return (end - start) / n_iters + + # warm-up + run_cuda_benchmark(3) + + lat = run_cuda_benchmark(num_iters) + + # free tensors to mitigate OOM when sweeping + del key, value, key_cache, value_cache, slot_mapping + torch.cuda.empty_cache() + + return lat + + +def main(args): + rows = [] + for exp in range(1, 17): + n_tok = 2**exp + lat = run_benchmark( + num_tokens=n_tok, + num_heads=args.num_heads, + head_size=args.head_size, + block_size=args.block_size, + num_blocks=args.num_blocks, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + kv_cache_dtype=args.kv_cache_dtype, + num_iters=args.iters, + benchmark_mode=args.mode, + device="cuda", + ) + rows.append([n_tok, lat * 1e6]) # convert to microseconds + + print(f"Benchmark results for implementation cuda (measuring with {args.mode}):") + print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f")) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + + parser.add_argument("--num-heads", type=int, default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--num-blocks", type=int, default=128 * 128) + + parser.add_argument( + "--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="bfloat16", + ) + + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8"], + default="auto", + ) + + parser.add_argument("--iters", type=int, default=200) + + parser.add_argument( + "--mode", + type=str, + choices=["cudagraph", "no_graph"], + default="cudagraph", + ) + + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index 0aace571064a07b4f347c7158d8a6b4e926aee87..12f17ea575d9448b1cd7ad3e900e4a86a7594703 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import random import time @@ -14,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import ( ) from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import ( +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, - FlexibleArgumentParser, create_kv_caches_with_random_flash, ) diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index 4cf633a81358d80202fa383d7312dac4a818ea62..d8d7f5bcf9dada3b499e133ac7d7b262583fb615 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Optional, Union import torch from flashinfer.norm import fused_add_rmsnorm, rmsnorm @@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module): def forward( self, x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: orig_dtype = x.dtype x = x.to(torch.float32) if residual is not None: @@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module): def rmsnorm_naive( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) @@ -65,7 +64,7 @@ def rmsnorm_naive( def rmsnorm_flashinfer( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): orig_shape = x.shape @@ -89,7 +88,7 @@ def rmsnorm_flashinfer( def rmsnorm_vllm( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): orig_shape = x.shape diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index b81baf17a8c674edd72fe9a6397a0b027a8d33b9..074b7a440b612215333401ce89701ed24198033c 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,98 +1,76 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from itertools import accumulate -from typing import Optional +import itertools -import nvtx import torch -from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope -from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.triton_utils import triton +from vllm.utils.argparse_utils import FlexibleArgumentParser +batch_size_range = [2**i for i in range(0, 8, 2)] +seq_len_range = [2**i for i in range(6, 10, 1)] +num_heads_range = [32, 48] +configs = list(itertools.product(batch_size_range, seq_len_range, num_heads_range)) -def benchmark_rope_kernels_multi_lora( - is_neox_style: bool, - batch_size: int, - seq_len: int, - num_heads: int, - head_size: int, - rotary_dim: Optional[int], - dtype: torch.dtype, - seed: int, - device: str, - max_position: int = 8192, - base: float = 10000, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - if rotary_dim is None: - rotary_dim = head_size - # silulating serving 4 LoRAs - scaling_factors = [1, 2, 4, 8] - # batched RoPE can take multiple scaling factors - batched_rope = get_rope( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - {"rope_type": "linear", "factor": tuple(scaling_factors)}, + +def get_benchmark(head_size, rotary_dim, is_neox_style, device): + @triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size", "seq_len", "num_heads"], + x_vals=[list(_) for _ in configs], + line_arg="provider", + line_vals=["torch", "flashinfer", "vllm"], + line_names=["PyTorch", "FlashInfer", "vLLM"], + styles=[("blue", "-"), ("green", "-"), ("red", "-")], + ylabel="us", + plot_name=f"rope-perf{'-neox-style' if is_neox_style else ''}", + args={}, + ) ) - # non-batched RoPE takes only one scaling factor, we create multiple - # instances to simulate the same behavior - non_batched_ropes: list[RotaryEmbedding] = [] - for scaling_factor in scaling_factors: - non_batched_ropes.append( - get_rope( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - {"rope_type": "linear", "factor": (scaling_factor,)}, - ) + def benchmark(batch_size, seq_len, num_heads, provider): + dtype = torch.bfloat16 + max_position = 8192 + base = 10000 + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope = rope.to(dtype=dtype, device=device) + cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device) + + positions = torch.randint(0, max_position, (batch_size, seq_len), device=device) + query = torch.randn( + (batch_size, seq_len, num_heads * head_size), dtype=dtype, device=device ) + key = torch.randn_like(query) - positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype) - key = torch.randn_like(query) + quantiles = [0.5, 0.2, 0.8] - # create query offsets for batched RoPE, we concat multiple kv cache - # together and each query needs to find the right kv cache of its type - offset_map = torch.tensor( - list( - accumulate( - [0] - + [ - max_position * scaling_factor * 2 - for scaling_factor in scaling_factors[:-1] - ] + if provider == "torch": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rope.forward_native(positions, query.clone(), key.clone()), + quantiles=quantiles, ) - ) - ) - query_types = torch.randint( - 0, len(scaling_factors), (batch_size, seq_len), device=device - ) - # map query types to offsets - query_offsets = offset_map[query_types] - # the kernel takes flattened offsets - flatten_offsets = query_offsets.flatten() + elif provider == "flashinfer": + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: torch.ops.vllm.flashinfer_rotary_embedding( + positions, + query.clone(), + key.clone(), + head_size, + cos_sin_cache, + is_neox_style, + ), + quantiles=quantiles, + ) + else: + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: rope.forward_cuda(positions, query.clone(), key.clone()), + quantiles=quantiles, + ) + + return 1000 * ms, 1000 * max_ms, 1000 * min_ms - # batched queries of the same type together for non-batched RoPE - queries = [query[query_types == i] for i in range(len(scaling_factors))] - keys = [key[query_types == i] for i in range(len(scaling_factors))] - packed_qkr = zip(queries, keys, non_batched_ropes) - # synchronize before start timing - torch.cuda.synchronize() - with nvtx.annotate("non-batched", color="yellow"): - for q, k, r in packed_qkr: - r.forward(positions, q, k) - torch.cuda.synchronize() - with nvtx.annotate("batched", color="green"): - batched_rope.forward(positions, query, key, flatten_offsets) - torch.cuda.synchronize() + return benchmark if __name__ == "__main__": @@ -117,17 +95,12 @@ if __name__ == "__main__": parser.add_argument( "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0" ) + parser.add_argument("--save-path", type=str, default="./configs/rope/") args = parser.parse_args() - print(args) - benchmark_rope_kernels_multi_lora( - is_neox_style=args.is_neox_style, - batch_size=args.batch_size, - seq_len=args.seq_len, - num_heads=args.num_heads, - head_size=args.head_size, - rotary_dim=args.rotary_dim, - dtype=getattr(torch, args.dtype), - seed=args.seed, - device=args.device, + # Get the benchmark function + benchmark = get_benchmark( + args.head_size, args.rotary_dim, args.is_neox_style, args.device ) + # Run performance benchmark + benchmark.run(print_data=True, save_path=args.save_path) diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 18c459c31d3f84d1ca7b7e31d00d50b521177e96..3e23c4cac059c04cf6a4153e9830cbec2ace36f0 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -78,11 +78,11 @@ WEIGHT_SHAPES = { } WEIGHT_SHAPES_MOE = { - "nm-testing/Mixtral-8x7B-Instruct-v0.1": [ + "mistralai/Mixtral-8x7B-Instruct-v0.1": [ [8, 2, 4096, 28672], [8, 2, 14336, 4096], ], - "nm-testing/deepseekv2-lite": [ + "deepseek-ai/DeepSeek-V2-Lite": [ [64, 6, 2048, 1408], ], "ibm-granite/granite-3.0-1b-a400m": [ diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index c7a4066b39d70342db4e7bdc957e5616a6f32990..a5887aafd30d6ad9008c565dde608e9ec238fffb 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -1,5 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Comprehensive 3-way SiLU Benchmark Suite + +This benchmark compares three SiLU implementations: +1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation +2. Triton Kernel - Triton-based implementation + +The suite generates detailed performance comparisons including: +- Memory bandwidth utilization +- Speedup ratios (baseline vs optimized implementations) +- Performance across different expert configurations and token distributions +""" + from collections.abc import Callable import matplotlib.pyplot as plt @@ -7,7 +21,7 @@ import numpy as np import torch from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( - silu_mul_fp8_quant_deep_gemm_cuda, + persistent_masked_m_silu_mul_quant, ) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton @@ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton( num_parallel_tokens, group_size: int = 128, eps: float = 1e-10, + expert_offsets: torch.Tensor = None, ) -> tuple[torch.Tensor, torch.Tensor]: """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales @@ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton( # Parse generation strategies -strategies = ["uniform", "max_t", "first_t"] +strategies = ["random_imbalanced", "uniform", "max_t"] def benchmark( @@ -195,15 +210,27 @@ def benchmark( current_platform.seed_everything(42 + seed_offset) y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() - if gen_strategy == "uniform": - r = torch.rand(size=(E,), device="cuda") + if gen_strategy == "random_imbalanced": + + def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"): + mean = total_tokens // n_e + min_max = mean // ratio + e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean + e[0] = min_max + r = torch.rand(size=(E - 1,)) + r /= r.sum() + r *= total_tokens - min_max + r = r.round().long() + e[1:] = r.to(device=device) + return e + + tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda") + elif gen_strategy == "uniform": + r = torch.rand(size=(E,)) r /= r.sum() r *= total_tokens - tokens_per_expert = r.int() - tokens_per_expert = torch.minimum( - tokens_per_expert, - torch.ones((E,), device=r.device, dtype=torch.int) * T, - ) + r = r.round().long() + tokens_per_expert = r elif gen_strategy == "max_t": tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") tokens_per_expert.fill_(total_tokens / E) @@ -281,40 +308,34 @@ def benchmark( def create_comparison_plot( - ratio, cuda_times, baseline_times, config_labels, strategy_name, id + ratios, silu_v2_times, triton_times, config_labels, strategy_name, id ): - """Create a comparison plot for a specific generation strategy""" - fig, ax = plt.subplots(1, 1, figsize=(16, 6)) + fig, ax = plt.subplots(1, 1, figsize=(18, 6)) # Configure x-axis positions x = np.arange(len(config_labels)) - width = 0.35 + width = 0.25 # Execution Time plot (lower is better) + ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue") ax.bar( - x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue" - ) - ax.bar( - x + width / 2, - baseline_times, - width, - label="Baseline", - alpha=0.8, - color="orange", + x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green" ) - # Add speedup labels over each bar pair + # Add speedup labels over each bar trio for i in range(len(x)): - speedup = ratio[i] - max_height = max(cuda_times[i], baseline_times[i]) + triton_v2_speedup = ratios[i][1] # triton/v2 + max_height = max(silu_v2_times[i], triton_times[i]) + + # Triton/V2 speedup ax.text( - x[i], + x[i] + width / 2, max_height + max_height * 0.02, - f"{speedup:.2f}x", + f"{triton_v2_speedup:.2f}x", ha="center", va="bottom", fontweight="bold", - fontsize=9, + fontsize=8, ) ax.set_xlabel("Configuration") @@ -332,56 +353,75 @@ def create_comparison_plot( def create_combined_plot(all_results): - """Create a combined plot with all strategies in one PNG""" num_strategies = len(all_results) - fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies)) + fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies)) if num_strategies == 1: axes = [axes] for idx, ( strategy_name, - ratio, - cuda_times, - baseline_times, + all_ratios, + all_silu_v2_results, + all_triton_results, config_labels, + config_x_axis, ) in enumerate(all_results): ax = axes[idx] + # Flatten the nested results to get bandwidth percentages for plotting + silu_v2_bandwidths = [] + triton_bandwidths = [] + flat_ratios = [] + + for config_results in all_silu_v2_results: + for result in config_results: + silu_v2_bandwidths.append(result[3]) # bandwidth percentage + + for config_results in all_triton_results: + for result in config_results: + triton_bandwidths.append(result[3]) # bandwidth percentage + + for config_ratios in all_ratios: + for ratio in config_ratios: + flat_ratios.append(ratio) + # Configure x-axis positions x = np.arange(len(config_labels)) - width = 0.35 + width = 0.25 - # Execution Time plot (lower is better) + # Bandwidth utilization plot (higher is better) ax.bar( - x - width / 2, - cuda_times, + x, + silu_v2_bandwidths, width, - label="CUDA Kernel", + label="SiLU V2 (CUDA)", alpha=0.8, color="blue", ) ax.bar( - x + width / 2, - baseline_times, + x + width, + triton_bandwidths, width, - label="Baseline", + label="Triton Kernel", alpha=0.8, - color="orange", + color="green", ) - # Add speedup labels over each bar pair + # Add speedup labels over each bar trio for i in range(len(x)): - speedup = ratio[i] - max_height = max(cuda_times[i], baseline_times[i]) + triton_v2_speedup = flat_ratios[i] # triton/v2 + max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i]) + + # Triton/V2 speedup ax.text( - x[i], + x[i] + width / 2, max_height + max_height * 0.02, - f"{speedup:.2f}x", + f"{triton_v2_speedup:.2f}x", ha="center", va="bottom", fontweight="bold", - fontsize=9, + fontsize=8, ) ax.set_xlabel("Configuration") @@ -395,7 +435,7 @@ def create_combined_plot(all_results): ax.grid(True, alpha=0.3) plt.tight_layout() - filename = "../../silu_bench/silu_benchmark_combined.png" + filename = "silu_benchmark_combined_3way.png" plt.savefig(filename, dpi=300, bbox_inches="tight") plt.show() @@ -405,7 +445,9 @@ def create_combined_plot(all_results): outer_dim = 7168 configs = [ # DeepSeekV3 Configs + # (1, 56, 7168), (8, 1024, 7168), + # (32, 56, 7168), # DeepSeekV3 Configs (32, 1024, 7168), # DeepSeekV3 Configs @@ -417,6 +459,7 @@ num_warmups = 20 strategy_descriptions = { "uniform": "Uniform Random", + "random_imbalanced": "Imbalanced Random", "max_t": "Even Assignment", "first_t": "experts[0] = T, experts[1:] = 0", } @@ -433,28 +476,31 @@ for id, strategy in enumerate(strategies): print(f"Testing strategy: {strategy_descriptions[strategy]}") print(f"{'=' * 60}") - # Collect benchmark data for both algorithms + # Collect benchmark data for all three algorithms config_labels = [] config_x_axis = [] - all_cuda_results = [] - all_baseline_results = [] + all_silu_v2_results = [] + all_triton_results = [] all_ratios = [] for E, T, H in configs: - total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E] + total_tokens_config = [] + for i in [8, 16, 32, 64, 128, 256, 512]: + if i <= T: + total_tokens_config.append(i * E) config_x_axis.append(total_tokens_config) - cuda_results = [] - baseline_results = [] + silu_v2_results = [] + triton_results = [] ratios = [] for total_tokens in total_tokens_config: config_label = f"E={E},T={T},H={H},TT={total_tokens}" config_labels.append(config_label) - # CUDA kernel results - time_ms_cuda, gflops, gbps, perc = benchmark( - silu_mul_fp8_quant_deep_gemm_cuda, + # SiLU V2 (CUDA kernel) results + time_ms_silu_v2, gflops, gbps, perc = benchmark( + persistent_masked_m_silu_mul_quant, E, T, H, @@ -463,9 +509,9 @@ for id, strategy in enumerate(strategies): num_warmups=num_warmups, gen_strategy=strategy, ) - cuda_results.append((time_ms_cuda, gflops, gbps, perc)) + silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc)) - # Baseline results + # Triton kernel results time_ms_triton, gflops, gbps, perc = benchmark( silu_mul_fp8_quant_deep_gemm_triton, E, @@ -476,12 +522,20 @@ for id, strategy in enumerate(strategies): num_warmups=num_warmups, gen_strategy=strategy, ) - baseline_results.append((time_ms_triton, gflops, gbps, perc)) - ratios.append(time_ms_triton / time_ms_cuda) + triton_results.append((time_ms_triton, gflops, gbps, perc)) - print(f"Completed: {config_label}") - all_cuda_results.append(cuda_results) - all_baseline_results.append(baseline_results) + # Calculate speedup ratios (triton baseline / implementation) + triton_v2_ratio = time_ms_triton / time_ms_silu_v2 + ratios.append(triton_v2_ratio) + + print( + f"Completed: {config_label}:" + f" V2: {time_ms_silu_v2:.3f}ms," + f" Triton: {time_ms_triton:.3f}ms" + ) + + all_silu_v2_results.append(silu_v2_results) + all_triton_results.append(triton_results) all_ratios.append(ratios) # Store results for combined plotting @@ -489,8 +543,8 @@ for id, strategy in enumerate(strategies): ( strategy_descriptions[strategy], all_ratios, - all_cuda_results, - all_baseline_results, + all_silu_v2_results, + all_triton_results, config_labels, config_x_axis, ) @@ -498,15 +552,18 @@ for id, strategy in enumerate(strategies): # Print summary table for this strategy print(f"\nSummary Table - {strategy_descriptions[strategy]}:") - print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}") - print("-" * 60) + print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}") + print("-" * 90) for i, (E, T, H) in enumerate(configs): - speedup = baseline_results[i][0] / cuda_results[i][0] + # Get the first result for each config (simplifying for summary) + v2_time = silu_v2_results[i][0] + triton_time = triton_results[i][0] + triton_v2_speedup = triton_time / v2_time config_label = f"E={E:3d},T={T:4d},H={H:4d}" print( - f"{config_label:<20} {cuda_results[i][0]:8.5f} " - f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x" + f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} " + f"{triton_v2_speedup:8.2f}x" ) @@ -514,15 +571,14 @@ def create_total_tokens_plot(all_results): num_strategies = len(all_results) num_configs = len(configs) - # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage fig, axs = plt.subplots( - num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies) + num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies) ) # Add main title to the entire figure fig.suptitle( - "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)", - fontsize=16, + "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)", + fontsize=18, fontweight="bold", y=0.98, ) @@ -539,8 +595,8 @@ def create_total_tokens_plot(all_results): ( strategy_name, all_ratios, - all_cuda_results, - all_baseline_results, + all_silu_v2_results, + all_triton_results, config_labels, config_x_axis, ) = result @@ -555,42 +611,54 @@ def create_total_tokens_plot(all_results): ratios = all_ratios[config_idx] total_tokens_values = config_x_axis[config_idx] - # Extract CUDA and Triton bandwidth percentages - cuda_bandwidth_percentages = [ - result[3] for result in all_cuda_results[config_idx] + # Extract speedup ratios + triton_v2_ratios = [ratio for ratio in ratios] + + # Extract bandwidth percentages for all implementations + v2_bandwidth_percentages = [ + result[3] for result in all_silu_v2_results[config_idx] ] triton_bandwidth_percentages = [ - result[3] for result in all_baseline_results[config_idx] + result[3] for result in all_triton_results[config_idx] ] # Plot speedup ratios vs total tokens (left plot) ax_speedup.plot( - total_tokens_values, ratios, "bo-", linewidth=3, markersize=8 + total_tokens_values, + triton_v2_ratios, + "go-", + linewidth=3, + markersize=8, + label="Triton/V2 Speedup", ) ax_speedup.set_title( - f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}", + f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}", fontsize=12, fontweight="bold", ) ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) + ax_speedup.legend(prop={"weight": "bold"}) ax_speedup.grid(True, alpha=0.3) + # Plot bandwidth utilization (right plot) ax_bandwidth.plot( total_tokens_values, - cuda_bandwidth_percentages, - "ro-", + v2_bandwidth_percentages, + "o-", linewidth=3, markersize=8, - label="CUDA", + label="SiLU V2", + color="blue", ) ax_bandwidth.plot( total_tokens_values, triton_bandwidth_percentages, - "go-", + "o-", linewidth=3, markersize=8, label="Triton", + color="green", ) ax_bandwidth.set_title( f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", @@ -618,38 +686,12 @@ def create_total_tokens_plot(all_results): for label in ax.get_xticklabels() + ax.get_yticklabels(): label.set_fontweight("bold") - # Add value labels on speedup points - for x, y in zip(total_tokens_values, ratios): + # Add value labels on Triton/V2 speedup points + for x, y in zip(total_tokens_values, triton_v2_ratios): ax_speedup.annotate( f"{y:.2f}x", (x, y), textcoords="offset points", - xytext=(0, 12), - ha="center", - fontsize=10, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7), - ) - - # Add value labels on CUDA bandwidth points - for x, y in zip(total_tokens_values, cuda_bandwidth_percentages): - ax_bandwidth.annotate( - f"{y:.1f}%", - (x, y), - textcoords="offset points", - xytext=(0, 12), - ha="center", - fontsize=9, - fontweight="bold", - bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3), - ) - - # Add value labels on Triton bandwidth points - for x, y in zip(total_tokens_values, triton_bandwidth_percentages): - ax_bandwidth.annotate( - f"{y:.1f}%", - (x, y), - textcoords="offset points", xytext=(0, -15), ha="center", fontsize=9, @@ -659,17 +701,20 @@ def create_total_tokens_plot(all_results): plt.tight_layout() plt.subplots_adjust(top=0.93) # Make room for main title - filename = "silu_benchmark_total_tokens.png" + filename = "silu_benchmark_total_tokens_3way.png" plt.savefig(filename, dpi=300, bbox_inches="tight") plt.show() return filename -# Create combined plot with all strategies -combined_plot_filename = create_total_tokens_plot(all_results) +# Create comprehensive 3-way comparison plots +combined_plot_filename = create_combined_plot(all_results) +total_tokens_plot_filename = create_total_tokens_plot(all_results) -print(f"\n{'=' * 60}") -print("Benchmark Complete!") -print(f"Generated combined plot: {combined_plot_filename}") -print(f"{'=' * 60}") +print(f"\n{'=' * 80}") +print("3-Way Benchmark Suite Complete!") +print(f"Generated combined comparison plot: {combined_plot_filename}") +print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}") +print("Compared: SiLU V2 (CUDA), and Triton implementations") +print(f"{'=' * 80}") diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 6ddab46214577d010498cf24f3774f27f7368475..29ce18234dfa03875e32a7644be014ebf0289f0e 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -4,12 +4,11 @@ import csv import os from datetime import datetime -from typing import Optional import flashinfer import torch -from vllm.utils import round_up +from vllm.utils.math_utils import round_up FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = torch.float8_e4m3fn @@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn): @torch.no_grad() def benchmark_decode( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_len: int, num_heads: tuple[int, int] = (64, 8), diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 131df74c7de1b4c196b75e074cdd3480b978db2f..2a25d03748112ba0d3fa49b7a7449c7fe215049f 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -4,12 +4,11 @@ import csv import os from datetime import datetime -from typing import Optional import flashinfer import torch -from vllm.utils import round_up +from vllm.utils.math_utils import round_up FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = torch.float8_e4m3fn @@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn): @torch.no_grad() def benchmark_prefill( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_len: int, num_heads: tuple[int, int] = (64, 8), diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index c6c8e0b0b936bcc3c78e78cae111421ba0805f4a..ab54f81985bc25ba06d820d486852a5c3b7fe4d1 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -14,11 +14,11 @@ import torch from tqdm import tqdm from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - _w8a8_block_fp8_matmul, + _w8a8_triton_block_scaled_mm, ) from vllm.platforms import current_platform from vllm.triton_utils import triton -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser mp.set_start_method("spawn", force=True) @@ -83,7 +83,7 @@ def w8a8_block_matmul( ) if A.dtype == torch.float8_e4m3fn: - kernel = _w8a8_block_fp8_matmul + kernel = _w8a8_triton_block_scaled_mm else: raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index db2398fc40a49dcf9b590d45e7ae6d4d794f0a0d..ba31bc563829858ac11aad81371133054406ee29 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# fmt: off # ruff: noqa: E501 import time @@ -9,7 +8,7 @@ import torch from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, - w8a8_block_fp8_matmul, + w8a8_triton_block_scaled_mm, ) from vllm.triton_utils import triton from vllm.utils.deep_gemm import ( @@ -20,19 +19,21 @@ from vllm.utils.deep_gemm import ( ) -def benchmark_shape(m: int, - n: int, - k: int, - warmup: int = 100, - repeat: int = 10000, - verbose: bool = False) -> dict: +def benchmark_shape( + m: int, + n: int, + k: int, + warmup: int = 100, + repeat: int = 10000, + verbose: bool = False, +) -> dict: """Benchmark all implementations for a specific (m, n, k) shape.""" if verbose: print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") # Create test tensors - A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) - B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) + A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) + B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16) # Reference result in BF16 torch.cuda.synchronize() @@ -49,34 +50,39 @@ def benchmark_shape(m: int, # Pre-quantize A for all implementations A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) - C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( - A, block_size[1], column_major_scales=True) + A, block_size[1], column_major_scales=True + ) # === DeepGEMM Implementation === def deepgemm_gemm(): - fp8_gemm_nt((A_deepgemm, A_scale_deepgemm), - (B_deepgemm, B_scale_deepgemm), - C_deepgemm) + fp8_gemm_nt( + (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm + ) return C_deepgemm # === vLLM Triton Implementation === def vllm_triton_gemm(): - return w8a8_block_fp8_matmul(A_vllm, - B_vllm, - A_scale_vllm, - B_scale_vllm, - block_size, - output_dtype=torch.bfloat16) + return w8a8_triton_block_scaled_mm( + A_vllm, + B_vllm, + A_scale_vllm, + B_scale_vllm, + block_size, + output_dtype=torch.bfloat16, + ) # === vLLM CUTLASS Implementation === def vllm_cutlass_gemm(): - return ops.cutlass_scaled_mm(A_vllm_cutlass, - B_vllm.T, - scale_a=A_scale_vllm_cutlass, - scale_b=B_scale_vllm.T, - out_dtype=torch.bfloat16) + return ops.cutlass_scaled_mm( + A_vllm_cutlass, + B_vllm.T, + scale_a=A_scale_vllm_cutlass, + scale_b=B_scale_vllm.T, + out_dtype=torch.bfloat16, + ) # Run correctness check first if verbose: @@ -93,26 +99,23 @@ def benchmark_shape(m: int, print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") - print("vLLM Triton vs DeepGEMM difference: " - f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") - print("vLLM CUTLASS vs DeepGEMM difference: " - f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") + print( + "vLLM Triton vs DeepGEMM difference: " + f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}" + ) + print( + "vLLM CUTLASS vs DeepGEMM difference: " + f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}" + ) # Benchmark implementations implementations = { "DeepGEMM": deepgemm_gemm, "vLLM Triton": vllm_triton_gemm, - "vLLM CUTLASS": vllm_cutlass_gemm + "vLLM CUTLASS": vllm_cutlass_gemm, } - benchmark_results = { - "shape": { - "m": m, - "n": n, - "k": k - }, - "implementations": {} - } + benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}} for name, func in implementations.items(): # Warmup @@ -140,38 +143,36 @@ def benchmark_shape(m: int, "tflops": tflops, "gb_s": gb_s, "diff": { - "DeepGEMM": - 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), - "Reference": - deepgemm_diff if name == "DeepGEMM" else - (vllm_triton_diff - if name == "vLLM Triton" else vllm_cutlass_diff) - } + "DeepGEMM": 0.0 + if name == "DeepGEMM" + else calc_diff(func(), C_deepgemm), + "Reference": deepgemm_diff + if name == "DeepGEMM" + else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff), + }, } if verbose: - print( - f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" - ) + print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s") # Calculate speedups baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] for name, data in benchmark_results["implementations"].items(): if name != "DeepGEMM": speedup = baseline / data["time_ms"] - benchmark_results["implementations"][name][ - "speedup_vs_deepgemm"] = speedup + benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup if verbose: - print(f"DeepGEMM is {1/speedup:.2f}x " - f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") + print( + f"DeepGEMM is {1 / speedup:.2f}x " + f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}" + ) - vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ - "time_ms"] - vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ - "time_ms"] + vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"] + vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"] cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time - benchmark_results["implementations"]["vLLM CUTLASS"][ - "speedup_vs_triton"] = cutlass_vs_triton + benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = ( + cutlass_vs_triton + ) if verbose: print( f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " @@ -183,8 +184,7 @@ def benchmark_shape(m: int, def format_table_row(values, widths): """Format a row with specified column widths.""" - return "| " + " | ".join(f"{val:{w}}" - for val, w in zip(values, widths)) + " |" + return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |" def print_table(headers, rows, title=None): @@ -292,38 +292,50 @@ def run_benchmarks(verbose: bool = False): for result in all_results: shape = result["shape"] impl_data = result["implementations"]["DeepGEMM"] - deepgemm_rows.append([ - shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" - ]) + deepgemm_rows.append( + [ + shape["m"], + shape["n"], + shape["k"], + f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", + f"{impl_data['gb_s']:.1f}", + ] + ) - print_table(deepgemm_headers, - deepgemm_rows, - title="DeepGEMM Implementation:") + print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:") # Print vLLM Triton table - triton_headers = [ - "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM" - ] + triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"] triton_rows = [] for result in all_results: shape = result["shape"] impl_data = result["implementations"]["vLLM Triton"] speedup = impl_data.get("speedup_vs_deepgemm", 1.0) - triton_rows.append([ - shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", - format_speedup(speedup) - ]) + triton_rows.append( + [ + shape["m"], + shape["n"], + shape["k"], + f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", + f"{impl_data['gb_s']:.1f}", + format_speedup(speedup), + ] + ) - print_table(triton_headers, - triton_rows, - title="vLLM Triton Implementation:") + print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:") # Print vLLM CUTLASS table cutlass_headers = [ - "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM", - "vs Triton" + "m", + "n", + "k", + "Time (μs)", + "TFLOPS", + "GB/s", + "vs DeepGEMM", + "vs Triton", ] cutlass_rows = [] for result in all_results: @@ -331,28 +343,27 @@ def run_benchmarks(verbose: bool = False): impl_data = result["implementations"]["vLLM CUTLASS"] vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) vs_triton = impl_data.get("speedup_vs_triton", 1.0) - cutlass_rows.append([ - shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", - f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", - format_speedup(vs_deepgemm), - format_speedup(vs_triton) - ]) + cutlass_rows.append( + [ + shape["m"], + shape["n"], + shape["k"], + f"{impl_data['time_us']:.1f}", + f"{impl_data['tflops']:.1f}", + f"{impl_data['gb_s']:.1f}", + format_speedup(vs_deepgemm), + format_speedup(vs_triton), + ] + ) - print_table(cutlass_headers, - cutlass_rows, - title="vLLM CUTLASS Implementation:") + print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:") # Calculate and print averages print("\n===== AVERAGE PERFORMANCE =====") implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] avg_metrics = { - impl: { - "tflops": 0, - "gb_s": 0, - "time_ms": 0 - } - for impl in implementations + impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations } for result in all_results: @@ -370,9 +381,9 @@ def run_benchmarks(verbose: bool = False): avg_tflops = avg_metrics[impl]["tflops"] / num_shapes avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes avg_time = avg_metrics[impl]["time_ms"] / num_shapes - avg_rows.append([ - impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" - ]) + avg_rows.append( + [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"] + ) print_table(avg_headers, avg_rows) @@ -380,21 +391,19 @@ def run_benchmarks(verbose: bool = False): avg_speedups = { "DeepGEMM vs vLLM Triton": 0, "DeepGEMM vs vLLM CUTLASS": 0, - "vLLM CUTLASS vs vLLM Triton": 0 + "vLLM CUTLASS vs vLLM Triton": 0, } for result in all_results: deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] - vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ - "time_ms"] + vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"] - avg_speedups[ - "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time - avg_speedups[ - "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time - avg_speedups[ - "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time + avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time + avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time + avg_speedups["vLLM CUTLASS vs vLLM Triton"] += ( + vllm_triton_time / vllm_cutlass_time + ) print("\n===== AVERAGE SPEEDUPS =====") speedup_headers = ["Comparison", "Speedup"] @@ -412,8 +421,7 @@ def run_benchmarks(verbose: bool = False): for result in all_results: for impl in implementations: - avg_diff[impl] += result["implementations"][impl]["diff"][ - "Reference"] + avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"] diff_headers = ["Implementation", "Avg Diff vs Reference"] diff_rows = [] diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 9a4da0ef5a85d0df2178d5e990ad177e3f76bf37..6964a3d3e0824d6ec93d6dff012b79cc56f7433e 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -11,7 +11,7 @@ import regex as re import seaborn as sns from torch.utils.benchmark import Measurement as TMeasurement -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser if __name__ == "__main__": parser = FlexibleArgumentParser( diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 4bbb36bb4359259944310adb8e1047a02f126dea..a9af811bbe9ca9e0b0f66c493f27bcc890dc3515 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses -from collections.abc import Iterable -from typing import Any, Callable, Optional +from collections.abc import Callable, Iterable +from typing import Any import torch import torch.utils.benchmark as TBenchmark @@ -55,7 +55,7 @@ class Bench: def __init__( self, - cuda_graph_params: Optional[CudaGraphBenchParams], + cuda_graph_params: CudaGraphBenchParams | None, label: str, sub_label: str, description: str, diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py index 67b937930d58c2c28043eed0259b7a7b85e3f2d0..8cb8a2f386a9715c06617ec6afafc79dca3cec2f 100644 --- a/benchmarks/multi_turn/bench_dataset.py +++ b/benchmarks/multi_turn/bench_dataset.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from statistics import mean -from typing import Any, NamedTuple, Optional, Union +from typing import Any, NamedTuple import numpy as np # type: ignore import pandas as pd # type: ignore @@ -11,6 +11,7 @@ from bench_utils import ( Color, logger, ) +from tqdm import tqdm from transformers import AutoTokenizer # type: ignore # Conversation ID is a string (e.g: "UzTK34D") @@ -35,8 +36,8 @@ class Distribution(ABC): class UniformDistribution(Distribution): def __init__( self, - min_val: Union[int, float], - max_val: Union[int, float], + min_val: int | float, + max_val: int | float, is_integer: bool = True, ) -> None: self.min_val = min_val @@ -56,7 +57,7 @@ class UniformDistribution(Distribution): class ConstantDistribution(Distribution): - def __init__(self, value: Union[int, float]) -> None: + def __init__(self, value: int | float) -> None: self.value = value self.max_val = value @@ -68,7 +69,7 @@ class ConstantDistribution(Distribution): class ZipfDistribution(Distribution): - def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + def __init__(self, alpha: float, max_val: int | None = None) -> None: self.alpha = alpha self.max_val = max_val @@ -83,7 +84,7 @@ class ZipfDistribution(Distribution): class PoissonDistribution(Distribution): - def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + def __init__(self, alpha: float, max_val: int | None = None) -> None: self.alpha = alpha self.max_val = max_val @@ -100,11 +101,11 @@ class PoissonDistribution(Distribution): class LognormalDistribution(Distribution): def __init__( self, - mean: Optional[float] = None, - sigma: Optional[float] = None, - average: Optional[int] = None, - median_ratio: Optional[float] = None, - max_val: Optional[int] = None, + mean: float | None = None, + sigma: float | None = None, + average: int | None = None, + median_ratio: float | None = None, + max_val: int | None = None, ) -> None: self.average = average self.median_ratio = median_ratio @@ -417,6 +418,10 @@ def generate_conversations( data = file.read() tokens_in_file = tokenizer.encode(data, add_special_tokens=False) list_of_tokens.extend(tokens_in_file) + logger.info( + f"Loaded {len(tokens_in_file)} tokens from file {filename}, " + f"total tokens so far: {len(list_of_tokens)}" + ) conversations: ConversationsMap = {} conv_id = 0 @@ -449,18 +454,25 @@ def generate_conversations( ) base_offset += common_prefix_tokens - for conv_id in range(args.num_conversations): + for conv_id in tqdm( + range(args.num_conversations), + total=args.num_conversations, + desc="Generating conversations", + unit="conv", + ): # Generate a single conversation messages: MessagesList = [] nturns = turn_count[conv_id] # User prompt token count per turn (with lower limit) - input_token_count: np.ndarray = args.input_num_tokens.sample(nturns) + input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int) input_token_count = np.maximum(input_token_count, base_prompt_token_count) # Assistant answer token count per turn (with lower limit) - output_token_count: np.ndarray = args.output_num_tokens.sample(nturns) + output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype( + int + ) output_token_count = np.maximum(output_token_count, 1) user_turn = True diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 66d85eaf513125cc5431dff9256da8a621a2a7c4..772d685ad90ff8d93822c3fca9b7fc1796a06ab1 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -13,7 +13,7 @@ from datetime import datetime from enum import Enum from http import HTTPStatus from statistics import mean -from typing import NamedTuple, Optional, Union +from typing import NamedTuple import aiohttp # type: ignore import numpy as np # type: ignore @@ -46,15 +46,16 @@ class ConversationSampling(str, Enum): class ClientArgs(NamedTuple): seed: int - max_num_requests: Optional[int] + max_num_requests: int | None skip_first_turn: bool - max_turns: Optional[int] + max_turns: int | None max_active_conversations: int verbose: bool print_content: bool verify_output: bool conversation_sampling: ConversationSampling request_rate: float + max_retries: int class RequestArgs(NamedTuple): @@ -63,6 +64,7 @@ class RequestArgs(NamedTuple): stream: bool limit_min_tokens: int # Use negative value for no limit limit_max_tokens: int # Use negative value for no limit + timeout_sec: int class BenchmarkArgs(NamedTuple): @@ -109,9 +111,9 @@ class RequestStats(NamedTuple): class MetricStats: def __init__(self) -> None: - self.min: Optional[float] = None - self.max: Optional[float] = None - self.avg: Optional[float] = None + self.min: float | None = None + self.max: float | None = None + self.avg: float | None = None self.sum = 0.0 self.count = 0 @@ -143,7 +145,7 @@ class MovingAverage: self.index = 0 self.sum = 0.0 self.count = 0 - self.avg: Optional[float] = None + self.avg: float | None = None def update(self, new_value: float) -> None: if self.count < self.window_size: @@ -169,7 +171,7 @@ class MovingAverage: class DebugStats: def __init__(self, logger: logging.Logger, window_size: int) -> None: self.logger = logger - self.metrics: dict[str, Union[MovingAverage, MetricStats]] = { + self.metrics: dict[str, MovingAverage | MetricStats] = { "moving_avg_ttft_ms": MovingAverage(window_size), "moving_avg_tpot_ms": MovingAverage(window_size), "ttft_ms": MetricStats(), @@ -198,14 +200,6 @@ class DebugStats: self.logger.info("-" * 50) -# Must support Python 3.8, we can't use str.removeprefix(prefix) -# introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix) :] - return text - - def nanosec_to_millisec(value: float) -> float: return value / 1000000.0 @@ -220,8 +214,9 @@ async def send_request( chat_url: str, model: str, stream: bool = True, - min_tokens: Optional[int] = None, - max_tokens: Optional[int] = None, + min_tokens: int | None = None, + max_tokens: int | None = None, + timeout_sec: int = 120, ) -> ServerResponse: payload = { "model": model, @@ -243,16 +238,22 @@ async def send_request( headers = {"Content-Type": "application/json"} # Calculate the timeout for the request - timeout_sec = 120 if max_tokens is not None: # Assume TPOT of 200ms and use max_tokens to determine timeout - timeout_sec = max(timeout_sec, int(max_tokens * 0.2)) + token_based_timeout = int(max_tokens * 0.2) + if token_based_timeout > timeout_sec: + timeout_sec = token_based_timeout + logger.info( + "Using timeout of %ds based on max_tokens %d", + timeout_sec, + max_tokens, + ) timeout = aiohttp.ClientTimeout(total=timeout_sec) valid_response = True - ttft: Optional[float] = None + ttft: float | None = None chunk_delay: list[int] = [] - latency: Optional[float] = None + latency: float | None = None first_chunk = "" generated_text = "" @@ -269,7 +270,7 @@ async def send_request( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk == "[DONE]": # End of stream latency = time.perf_counter_ns() - start_time @@ -364,7 +365,7 @@ async def send_turn( req_args: RequestArgs, verbose: bool, verify_output: bool, -) -> Optional[RequestStats]: +) -> RequestStats | None: assert messages_to_use > 0 assert messages_to_use <= len(conversation_messages) @@ -417,6 +418,7 @@ async def send_turn( req_args.stream, min_tokens, max_tokens, + req_args.timeout_sec, ) if response.valid is False: @@ -526,6 +528,25 @@ async def poisson_sleep(request_rate: float, verbose: bool = False) -> None: await asyncio.sleep(interval) +async def exponential_backoff_sleep( + attempt_cnt: int, + base_rate: float = 1.0, + backoff_factor: float = 2.0, + jitter_fraction: float = 0.10, + verbose: bool = False, +) -> None: + # Sleep with exponential backoff and jitter after a failed request. + backoff_delay = base_rate * (backoff_factor**attempt_cnt) + jittered_delay = backoff_delay * ( + 1 + np.random.uniform(-jitter_fraction, jitter_fraction) + ) + + if verbose: + logger.info(f"Backoff for {jittered_delay:.3f} seconds...") + + await asyncio.sleep(jittered_delay) + + async def client_main( args: ClientArgs, req_args: RequestArgs, @@ -540,8 +561,11 @@ async def client_main( f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501 ) - random.seed(args.seed) - np.random.seed(args.seed) + # Set unique seed per client (each client runs in its own process) + # Add 1 to ensure no client uses the same seed as the main process + client_seed = args.seed + client_id + 1 + random.seed(client_seed) + np.random.seed(client_seed) # Active conversations active_convs: ConversationsMap = {} @@ -644,7 +668,7 @@ async def client_main( if args.verbose: curr_time_sec: float = time.perf_counter() - time_since_last_turn: Union[str, float] = "N/A" + time_since_last_turn: str | float = "N/A" if conv_id in time_of_last_turn: time_since_last_turn = round( curr_time_sec - time_of_last_turn[conv_id], 3 @@ -654,49 +678,62 @@ async def client_main( ) time_of_last_turn[conv_id] = curr_time_sec - success = True - try: - result = await send_turn( - session, - client_id, - conv_id, - messages, - current_turn, - tokenizer, - req_args, - args.print_content, - args.verify_output, - ) - if result is not None: - result_queue.put(result) - else: - # None means that the request failed, - # and should not be added to the statistics. - success = False - num_failures += 1 - - logger.warning( - f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + success = False + for attempt_cnt in range(args.max_retries + 1): + try: + exception = False + result = await send_turn( + session, + client_id, + conv_id, + messages, + current_turn, + tokenizer, + req_args, + args.print_content, + args.verify_output, + ) + if result is not None: + result_queue.put(result) + success = True + break + else: + logger.warning( + f"{Color.YELLOW}Client {client_id} - Request rejected during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 + ) + except asyncio.exceptions.TimeoutError: + exception = True + logger.error( + "%sClient %d - Timeout during conversation ID %s (turn: %d). " + "Base timeout is %ss (set with --request-timeout-sec), but the " + "effective timeout may be longer based on max_tokens. If this " + "is unexpected, consider increasing the timeout or checking " + "model performance.%s", + Color.RED, + client_id, + conv_id, + current_turn, + req_args.timeout_sec, + Color.RESET, + ) + except Exception: + exception = True + logger.exception( + f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 ) - # Remove the conversation (should not be used again) - active_convs.pop(conv_id) - - except asyncio.exceptions.TimeoutError: - num_failures += 1 - logger.exception( - f"{Color.RED}Client {client_id} - Timeout during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 - ) - break # Exit gracefully instead of raising an error + # Sleep before retry if not last attempt + if not success and attempt_cnt < args.max_retries: + await exponential_backoff_sleep(attempt_cnt, verbose=args.verbose) - except Exception: + if not success: num_failures += 1 - logger.exception( - f"{Color.RED}Client {client_id} - Exception during conversation ID {conv_id} (turn: {current_turn}){Color.RESET}" # noqa: E501 - ) - break # Exit gracefully instead of raising an error + # Remove the conversation (should not be used again) + active_convs.pop(conv_id) + if exception: + break # Exit gracefully instead of raising an error - if success: + else: num_successes += 1 # Update the turns counter to include the LLM response @@ -769,7 +806,7 @@ def get_client_config( "Number of conversations must be equal or larger than the number of clients" ) - max_req_per_client: Optional[int] = None + max_req_per_client: int | None = None if args.max_num_requests is not None: # Max number of requests per client req_per_client = args.max_num_requests // args.num_clients @@ -811,6 +848,7 @@ def get_client_config( verify_output=args.verify_output, conversation_sampling=args.conversation_sampling, request_rate=args.request_rate, + max_retries=args.max_retries, ) if args.limit_min_tokens > 0 or args.limit_max_tokens > 0: @@ -823,6 +861,9 @@ def get_client_config( "Invalid min/max tokens limits (min should not be larger than max)" ) + if args.request_timeout_sec <= 0: + raise ValueError("Request timeout must be a positive number") + # Arguments for API requests chat_url = f"{args.url}/v1/chat/completions" model_name = args.served_model_name if args.served_model_name else args.model @@ -833,6 +874,7 @@ def get_client_config( stream=not args.no_stream, limit_min_tokens=args.limit_min_tokens, limit_max_tokens=args.limit_max_tokens, + timeout_sec=args.request_timeout_sec, ) return client_args, req_args @@ -936,13 +978,13 @@ async def main_mp( f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 ) - rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3) + rps: str | float = round(len(client_metrics) / runtime_sec, 3) if len(client_metrics) < (5 * bench_args.num_clients): # Do not estimate the RPS if the number of samples is very low # (threshold can be tuned if needed) rps = "N/A" - runtime_left_sec: Union[str, float] = round( + runtime_left_sec: str | float = round( (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 ) if percent < 0.05: @@ -976,7 +1018,7 @@ async def main_mp( f"(is alive: {client.is_alive()}){Color.RESET}" ) - client.join(timeout=120) + client.join(timeout=req_args.timeout_sec + 1) if client.is_alive(): logger.warning( @@ -1032,7 +1074,7 @@ def process_statistics( warmup_percentages: list[float], test_params: dict, verbose: bool, - gen_conv_args: Optional[GenConvArgs] = None, + gen_conv_args: GenConvArgs | None = None, excel_output: bool = False, ) -> None: if len(client_metrics) == 0: @@ -1259,7 +1301,7 @@ async def main() -> None: default=None, help="The model name used in the API. " "If not specified, the model name will be the " - "same as the ``--model`` argument. ", + "same as the `--model` argument. ", ) parser.add_argument( @@ -1342,6 +1384,16 @@ async def main() -> None: help="Expected request rate (Poisson process) per client in requests/sec." "Set to 0 for no delay between requests.", ) + parser.add_argument( + "--max-retries", + type=int, + default=int(os.environ.get("MULTITURN_BENCH_MAX_RETRIES", "0")), + help="Maximum number of retry attempts for timed-out requests. " + "Default is 0 (no retries). " + "Set to higher values to retry failed requests and maintain " + "fair workload distribution. " + "Can also be set via MULTITURN_BENCH_MAX_RETRIES environment variable.", + ) parser.add_argument( "--conversation-sampling", type=ConversationSampling, @@ -1359,6 +1411,13 @@ async def main() -> None: action="store_true", help="Verify the LLM output (compare to the answers in the input JSON file)", ) + parser.add_argument( + "--request-timeout-sec", + type=int, + default=120, + help="Timeout in seconds for each API request (default: 120). " + "Automatically increased if max tokens imply longer decoding.", + ) parser.add_argument( "--no-stream", @@ -1434,11 +1493,10 @@ async def main() -> None: f"Invalid --warmup-percentage={args.warmup_percentage}" ) from None + # Set global seeds for main process random.seed(args.seed) np.random.seed(args.seed) - if not os.path.exists(args.model): - raise OSError(f"Path does not exist: {args.model}") logger.info("Loading tokenizer") tokenizer = AutoTokenizer.from_pretrained(args.model) diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py index c3622c99a2e5361c79110cdd3ca7b5bff4d78f22..fccab4d0ce21ad69980736710eee7e9814485974 100644 --- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py +++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py @@ -13,7 +13,7 @@ import argparse import json import random from statistics import mean -from typing import Any, Optional +from typing import Any import pandas as pd # type: ignore import tqdm # type: ignore @@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool: def content_is_valid( - content: str, min_content_len: Optional[int], max_content_len: Optional[int] + content: str, min_content_len: int | None, max_content_len: int | None ) -> bool: if min_content_len and len(content) < min_content_len: return False @@ -37,7 +37,7 @@ def content_is_valid( def print_stats( - conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None + conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None ) -> None: # Collect statistics stats = [] @@ -109,12 +109,12 @@ def convert_sharegpt_to_openai( seed: int, input_file: str, output_file: str, - max_items: Optional[int], - min_content_len: Optional[int] = None, - max_content_len: Optional[int] = None, - min_turns: Optional[int] = None, - max_turns: Optional[int] = None, - model: Optional[str] = None, + max_items: int | None, + min_content_len: int | None = None, + max_content_len: int | None = None, + min_turns: int | None = None, + max_turns: int | None = None, + model: str | None = None, ) -> None: if min_turns and max_turns: assert min_turns <= max_turns diff --git a/benchmarks/multi_turn/requirements.txt b/benchmarks/multi_turn/requirements.txt index f0e1935914a14b72167a6c32f93702530376d0af..bae656a5c5c4bd4f2bb544b32138cd31fd692c87 100644 --- a/benchmarks/multi_turn/requirements.txt +++ b/benchmarks/multi_turn/requirements.txt @@ -2,4 +2,5 @@ numpy>=1.24 pandas>=2.0.0 aiohttp>=3.10 transformers>=4.46 -xlsxwriter>=3.2.1 \ No newline at end of file +xlsxwriter>=3.2.1 +tqdm>=4.66 diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index 0957a9c65f06c912ae73588e017602af72ae9772..178599952d5c4e81145c73676b8ab0d4eaef6aa6 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -5,7 +5,7 @@ import cProfile import pstats from vllm import LLM, SamplingParams -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000 diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml deleted file mode 100644 index 65b1e09a247e2d74e31b616c38a8544de75ad525..0000000000000000000000000000000000000000 --- a/benchmarks/pyproject.toml +++ /dev/null @@ -1,49 +0,0 @@ -# This local pyproject file is part of the migration from yapf to ruff format. -# It uses the same core rules as the main pyproject.toml file, but with the -# following differences: -# - ruff line length is overridden to 88 -# - deprecated typing ignores (UP006, UP035) have been removed - -[tool.ruff] -line-length = 88 - -[tool.ruff.lint.per-file-ignores] -"vllm/third_party/**" = ["ALL"] -"vllm/version.py" = ["F401"] -"vllm/_version.py" = ["ALL"] - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - "I", - # flake8-logging-format - "G", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # Loop control variable not used within loop body - "B007", - # f-string format - "UP032", - # Can remove once 3.10+ is the minimum Python version - "UP007", -] - -[tool.ruff.lint.isort] -known-first-party = ["vllm"] - -[tool.ruff.format] -docstring-code-format = true \ No newline at end of file diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index e6d0012c1a4be6d4a17708f1061e4bb278f68bf1..aa84125818d10d98194e50de7f3c09809093c5fa 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -15,6 +15,7 @@ endif() # set(ENABLE_AVX512BF16 $ENV{VLLM_CPU_AVX512BF16}) set(ENABLE_AVX512VNNI $ENV{VLLM_CPU_AVX512VNNI}) +set(ENABLE_AMXBF16 $ENV{VLLM_CPU_AMXBF16}) include_directories("${CMAKE_SOURCE_DIR}/csrc") @@ -140,6 +141,22 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) set(ENABLE_AVX512VNNI OFF) message(WARNING "Disable AVX512-VNNI ISA support, no avx512_vnni found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512VNNI=1.") endif() + + find_isa(${CPUINFO} "amx_bf16" AMXBF16_FOUND) + if (AMXBF16_FOUND OR ENABLE_AMXBF16) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND + CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) + list(APPEND CXX_COMPILE_FLAGS "-mamx-bf16" "-mamx-tile") + set(ENABLE_AMXBF16 ON) + add_compile_definitions(-DCPU_CAPABILITY_AMXBF16) + else() + set(ENABLE_AMXBF16 OFF) + message(WARNING "Disable AMX_BF16 ISA support, requires gcc/g++ >= 12.3") + endif() + else() + set(ENABLE_AMXBF16 OFF) + message(WARNING "Disable AMX_BF16 ISA support, no amx_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AMXBF16=1.") + endif() elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") @@ -188,31 +205,115 @@ else() message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.") endif() -# -# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms) -# Flag to enable ACL kernels for AARCH64 platforms -if (VLLM_BUILD_ACL STREQUAL "ON") - set(USE_ACL ON) -else() - set(USE_ACL OFF) -endif() +# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms) if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) - FetchContent_Declare( - oneDNN - GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.9 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE - ) + # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64 + # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN + set(ONEDNN_AARCH64_USE_ACL OFF CACHE BOOL "") + if(ASIMD_FOUND) + # Set number of parallel build processes + include(ProcessorCount) + ProcessorCount(NPROC) + if(NOT NPROC) + set(NPROC 4) + endif() + # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0) + # and create a local shim dir with it + vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR) + + find_library(OPEN_MP + NAMES gomp + PATHS ${VLLM_TORCH_GOMP_SHIM_DIR} + NO_DEFAULT_PATH + REQUIRED + ) + # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch + if (OPEN_MP) + set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}") + endif() + + # Fetch and populate ACL + if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}") + message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}") + else() + message(STATUS "Downloading Arm Compute Library (ACL) from GitHub") + FetchContent_Populate(arm_compute + SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild" + SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src" + GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git + GIT_TAG v52.6.0 + GIT_SHALLOW TRUE + GIT_PROGRESS TRUE + ) + set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}") + set(ACL_LIB_DIR "$ENV{ACL_ROOT_DIR}/build") + endif() - if(USE_ACL) - find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/) - if(NOT ARM_COMPUTE_LIBRARY) - message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR") + # Build ACL with CMake + set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF") + set(CMAKE_BUILD_TYPE "Release") + set(ARM_COMPUTE_ARCH "armv8.2-a") + set(ARM_COMPUTE_ENABLE_ASSERTS "OFF") + set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF") + set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") + set(ARM_COMPUTE_ENABLE_OPENMP "ON") + set(ARM_COMPUTE_ENABLE_WERROR "OFF") + set(ARM_COMPUTE_BUILD_EXAMPLES "OFF") + set(ARM_COMPUTE_BUILD_TESTING "OFF") + + set(_cmake_config_cmd + ${CMAKE_COMMAND} -G Ninja -B build + -DARM_COMPUTE_BUILD_SHARED_LIB=OFF + -DCMAKE_BUILD_TYPE=Release + -DARM_COMPUTE_ARCH=armv8.2-a + -DARM_COMPUTE_ENABLE_ASSERTS=OFF + -DARM_COMPUTE_ENABLE_CPPTHREADS=OFF + -DARM_COMPUTE_ENABLE_OPENMP=ON + -DARM_COMPUTE_ENABLE_WERROR=OFF + -DARM_COMPUTE_BUILD_EXAMPLES=OFF + -DARM_COMPUTE_BUILD_TESTING=OFF) + set(_cmake_build_cmd + ${CMAKE_COMMAND} --build build -- -j${NPROC} + ) + + execute_process( + COMMAND ${_cmake_config_cmd} + WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}" + ) + execute_process( + COMMAND ${_cmake_build_cmd} + WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}" + RESULT_VARIABLE _acl_rc + ) + + if(NOT _acl_rc EQUAL 0) + message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).") endif() - set(ONEDNN_AARCH64_USE_ACL "ON") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") + message(STATUS "Arm Compute Library (ACL) built successfully.") + + # VLLM/oneDNN settings for ACL + set(ONEDNN_AARCH64_USE_ACL ON CACHE BOOL "" FORCE) + add_compile_definitions(VLLM_USE_ACL) + endif() + + set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.") + + if(FETCHCONTENT_SOURCE_DIR_ONEDNN) + message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}") + FetchContent_Declare( + oneDNN + SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN} + ) + else() + message(STATUS "Downloading oneDNN from GitHub") + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG v3.10 + GIT_PROGRESS TRUE + GIT_SHALLOW TRUE + ) endif() set(ONEDNN_LIBRARY_TYPE "STATIC") @@ -229,7 +330,10 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON set(ONEDNN_VERBOSE "OFF") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE}) + set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size FetchContent_MakeAvailable(oneDNN) + set(CMAKE_BUILD_TYPE ${VLLM_BUILD_TYPE}) add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp") target_include_directories( dnnl_ext @@ -259,14 +363,14 @@ endif() # set(VLLM_EXT_SRC "csrc/cpu/activation.cpp" - "csrc/cpu/attention.cpp" - "csrc/cpu/cache.cpp" "csrc/cpu/utils.cpp" "csrc/cpu/layernorm.cpp" "csrc/cpu/mla_decode.cpp" "csrc/cpu/pos_encoding.cpp" - "csrc/cpu/torch_bindings.cpp" - "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") + "csrc/moe/dynamic_4bit_int_moe_cpu.cpp" + "csrc/cpu/cpu_attn.cpp" + "csrc/cpu/scratchpad_manager.cpp" + "csrc/cpu/torch_bindings.cpp") if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC @@ -297,7 +401,7 @@ message(STATUS "CPU extension source files: ${VLLM_EXT_SRC}") # Define extension targets # -define_gpu_extension_target( +define_extension_target( _C DESTINATION vllm LANGUAGE CXX diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake index c9e7aec880b99ab2f4e785b0e94c7167fd820e15..2cf3c1a755d3c0b86623121eefb481644280c8a3 100644 --- a/cmake/external_projects/flashmla.cmake +++ b/cmake/external_projects/flashmla.cmake @@ -19,7 +19,7 @@ else() FetchContent_Declare( flashmla GIT_REPOSITORY https://github.com/vllm-project/FlashMLA - GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f + GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95 GIT_PROGRESS TRUE CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS) ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu + ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu ) set(FlashMLA_INCLUDES @@ -91,7 +92,7 @@ if(FLASH_MLA_ARCHS) SRCS "${FlashMLA_Extension_SOURCES}" CUDA_ARCHS "${FLASH_MLA_ARCHS}") - define_gpu_extension_target( + define_extension_target( _flashmla_C DESTINATION vllm LANGUAGE ${VLLM_GPU_LANG} @@ -108,7 +109,7 @@ if(FLASH_MLA_ARCHS) $<$:-UPy_LIMITED_API> $<$:-UPy_LIMITED_API>) - define_gpu_extension_target( + define_extension_target( _flashmla_extension_C DESTINATION vllm LANGUAGE ${VLLM_GPU_LANG} diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake new file mode 100644 index 0000000000000000000000000000000000000000..5a59a409999ad5d673eea5b3a67648bba2c43c5e --- /dev/null +++ b/cmake/external_projects/qutlass.cmake @@ -0,0 +1,97 @@ +include(FetchContent) + +set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory") + +if(DEFINED ENV{QUTLASS_SRC_DIR}) + set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR}) +endif() + +if(QUTLASS_SRC_DIR) + FetchContent_Declare( + qutlass + SOURCE_DIR ${QUTLASS_SRC_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + ) +else() + FetchContent_Declare( + qutlass + GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git + GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65 + GIT_PROGRESS TRUE + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + ) +endif() + +FetchContent_Populate(qutlass) + +if(NOT qutlass_SOURCE_DIR) + message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") +endif() +message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}") + +cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}") +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS) + + if(QUTLASS_ARCHS MATCHES "10\\.0a") + set(QUTLASS_TARGET_CC 100) + elseif(QUTLASS_ARCHS MATCHES "12\\.0a") + set(QUTLASS_TARGET_CC 120) + else() + message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.") + endif() + + set(QUTLASS_SOURCES + ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp + ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu + ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu + ) + + set(QUTLASS_INCLUDES + ${qutlass_SOURCE_DIR} + ${qutlass_SOURCE_DIR}/qutlass + ${qutlass_SOURCE_DIR}/qutlass/csrc/include + ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions + ) + + if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h") + list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}") + elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h") + list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include") + message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).") + else() + message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. " + "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include") + endif() + + set_gencode_flags_for_srcs( + SRCS "${QUTLASS_SOURCES}" + CUDA_ARCHS "${QUTLASS_ARCHS}" + ) + + target_sources(_C PRIVATE ${QUTLASS_SOURCES}) + target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES}) + target_compile_definitions(_C PRIVATE + QUTLASS_DISABLE_PYBIND=1 + TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC} + ) + + set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS + $<$:--expt-relaxed-constexpr --use_fast_math -O3> + ) + +else() + if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8") + message(STATUS + "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).") + else() + message(STATUS + "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in " + "CUDA_ARCHS='${CUDA_ARCHS}'.") + endif() +endif() diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 3d32121f13ac2709924cd23e47256e6a1bf820a6..567c8959f0454f7a0f6e0fd11b119acce11165f5 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a + GIT_TAG 58e0626a692f09241182582659e3bf8f16472659 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/cmake/hipify.py b/cmake/hipify.py index 55d378f5b11137739d21beb6d80df672040f66a4..8504f9defee96bcd1d3f6eb2698c78061b41bcec 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -16,7 +16,7 @@ import shutil from torch.utils.hipify.hipify_python import hipify -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() # Project directory where all the source + include files live. @@ -34,15 +34,14 @@ if __name__ == '__main__': ) # Source files to convert. - parser.add_argument("sources", - help="Source files to hipify.", - nargs="*", - default=[]) + parser.add_argument( + "sources", help="Source files to hipify.", nargs="*", default=[] + ) args = parser.parse_args() # Limit include scope to project_dir only - includes = [os.path.join(args.project_dir, '*')] + includes = [os.path.join(args.project_dir, "*")] # Get absolute path for all source files. extra_files = [os.path.abspath(s) for s in args.sources] @@ -51,25 +50,31 @@ if __name__ == '__main__': # The directory might already exist to hold object files so we ignore that. shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) - hipify_result = hipify(project_directory=args.project_dir, - output_directory=args.output_dir, - header_include_dirs=[], - includes=includes, - extra_files=extra_files, - show_detailed=True, - is_pytorch_extension=True, - hipify_extra_files_only=True) + hipify_result = hipify( + project_directory=args.project_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True, + ) hipified_sources = [] for source in args.sources: s_abs = os.path.abspath(source) - hipified_s_abs = (hipify_result[s_abs].hipified_path if - (s_abs in hipify_result - and hipify_result[s_abs].hipified_path is not None) - else s_abs) + hipified_s_abs = ( + hipify_result[s_abs].hipified_path + if ( + s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None + ) + else s_abs + ) hipified_sources.append(hipified_s_abs) - assert (len(hipified_sources) == len(args.sources)) + assert len(hipified_sources) == len(args.sources) # Print hipified source files. print("\n".join(hipified_sources)) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 10170c836594d78391c3a5f40e53b879a06a3544..7d62391469341ced1a0522161cdedbd25f1163e4 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -130,6 +130,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE) endfunction() +# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with: +# libgomp.so -> libgomp-.so... +# libgomp.so.1 -> libgomp-.so... +# OUTPUT: TORCH_GOMP_SHIM_DIR ("" if not found) +function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR) + set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE) + + # Use run_python to locate vendored libgomp; never throw on failure. + run_python(_VLLM_TORCH_GOMP_PATH + " +import os, glob +try: + import torch + torch_pkg = os.path.dirname(torch.__file__) + site_root = os.path.dirname(torch_pkg) + torch_libs = os.path.join(site_root, 'torch.libs') + print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0]) +except: + print('') +" + "failed to probe torch.libs for libgomp") + + if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}") + return() + endif() + + # Create shim under the build tree + set(_shim "${CMAKE_BINARY_DIR}/gomp_shim") + file(MAKE_DIRECTORY "${_shim}") + + execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so") + execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1") + execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so") + execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1") + + set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE) +endfunction() + # Macro for converting a `gencode` version number to a cmake version number. macro(string_to_ver OUT_VER IN_STR) string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) @@ -311,13 +349,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR list(REMOVE_DUPLICATES _PTX_ARCHS) list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS) - # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should - # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS + # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should + # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS set(_CUDA_ARCHS) foreach(_arch ${_SRC_CUDA_ARCHS}) - if(_arch MATCHES "\\a$") + if(_arch MATCHES "[af]$") list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}") - string(REPLACE "a" "" _base "${_arch}") + string(REGEX REPLACE "[af]$" "" _base "${_arch}") if ("${_base}" IN_LIST TGT_CUDA_ARCHS) list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}") list(APPEND _CUDA_ARCHS "${_arch}") @@ -416,21 +454,20 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) endmacro() # -# Define a target named `GPU_MOD_NAME` for a single extension. The +# Define a target named `MOD_NAME` for a single extension. The # arguments are: # # DESTINATION - Module destination directory. -# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, -# etc. +# LANGUAGE - The language for this module, e.g. CUDA, HIP, +# CXX, etc. # SOURCES - List of source files relative to CMakeLists.txt # directory. # # Optional arguments: # -# ARCHITECTURES - A list of target GPU architectures in cmake -# format. -# Refer `CMAKE_CUDA_ARCHITECTURES` documentation -# and `CMAKE_HIP_ARCHITECTURES` for more info. +# ARCHITECTURES - A list of target architectures in cmake format. +# For GPU, refer to CMAKE_CUDA_ARCHITECTURES and +# CMAKE_HIP_ARCHITECTURES for more info. # ARCHITECTURES will use cmake's defaults if # not provided. # COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. @@ -441,63 +478,61 @@ endmacro() # # Note: optimization level/debug info is set via cmake build type. # -function (define_gpu_extension_target GPU_MOD_NAME) +function (define_extension_target MOD_NAME) cmake_parse_arguments(PARSE_ARGV 1 - GPU + ARG "WITH_SOABI" "DESTINATION;LANGUAGE;USE_SABI" "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") # Add hipify preprocessing step when building with HIP/ROCm. - if (GPU_LANGUAGE STREQUAL "HIP") - hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}") + if (ARG_LANGUAGE STREQUAL "HIP") + hipify_sources_target(ARG_SOURCES ${MOD_NAME} "${ARG_SOURCES}") endif() - if (GPU_WITH_SOABI) - set(GPU_WITH_SOABI WITH_SOABI) + if (ARG_WITH_SOABI) + set(SOABI_KEYWORD WITH_SOABI) else() - set(GPU_WITH_SOABI) + set(SOABI_KEYWORD "") endif() - if (GPU_USE_SABI) - Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}") + if (ARG_USE_SABI) + Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}") else() - Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}") + Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}") endif() - if (GPU_LANGUAGE STREQUAL "HIP") + if (ARG_LANGUAGE STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. - add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) + add_dependencies(${MOD_NAME} hipify${MOD_NAME}) # Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder - target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc - ${GPU_INCLUDE_DIRECTORIES}) + target_include_directories(${MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc + ${ARG_INCLUDE_DIRECTORIES}) else() - target_include_directories(${GPU_MOD_NAME} PRIVATE csrc - ${GPU_INCLUDE_DIRECTORIES}) + target_include_directories(${MOD_NAME} PRIVATE csrc + ${ARG_INCLUDE_DIRECTORIES}) endif() - if (GPU_ARCHITECTURES) - set_target_properties(${GPU_MOD_NAME} PROPERTIES - ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") + if (ARG_ARCHITECTURES) + set_target_properties(${MOD_NAME} PROPERTIES + ${ARG_LANGUAGE}_ARCHITECTURES "${ARG_ARCHITECTURES}") endif() + target_compile_options(${MOD_NAME} PRIVATE + $<$:${ARG_COMPILE_FLAGS}>) - target_compile_options(${GPU_MOD_NAME} PRIVATE - $<$:${GPU_COMPILE_FLAGS}>) - - target_compile_definitions(${GPU_MOD_NAME} PRIVATE - "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") + target_compile_definitions(${MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - - target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES}) + target_link_libraries(${MOD_NAME} PRIVATE torch ${ARG_LIBRARIES}) # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of # dependencies that are not necessary and may not be installed. - if (GPU_LANGUAGE STREQUAL "CUDA") - target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) + if (ARG_LANGUAGE STREQUAL "CUDA") + target_link_libraries(${MOD_NAME} PRIVATE torch CUDA::cudart CUDA::cuda_driver ${ARG_LIBRARIES}) else() - target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + target_link_libraries(${MOD_NAME} PRIVATE torch ${TORCH_LIBRARIES} ${ARG_LIBRARIES}) endif() - install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) -endfunction() \ No newline at end of file + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${ARG_DESTINATION} COMPONENT ${MOD_NAME}) +endfunction() diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000000000000000000000000000000000000..304c0be8105fc2ee3596efd937d505ed9ba7d354 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,12 @@ +codecov: + require_ci_to_pass: false + +fixes: + # Map source code paths to repository root paths + # Wildcards match any Python version (python3.*) + - "/vllm-workspace/src/vllm/::vllm/" + - "/vllm-workspace/vllm/::vllm/" + - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/" + - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/" + - "/usr/lib/python3.*/dist-packages/vllm/::vllm/" + - "/usr/lib/python3.*/site-packages/vllm/::vllm/" diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index abd38d61f8bf851ca7f151c035a101803d23c03a..35b7a56c55a15186b9276c16a7f9e8c73e5c65a6 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -28,10 +28,10 @@ #ifdef USE_ROCM #include - #include "../quantization/fp8/amd/quant_utils.cuh" + #include "../quantization/w8a8/fp8/amd/quant_utils.cuh" typedef __hip_bfloat16 __nv_bfloat16; #else - #include "../quantization/fp8/nvidia/quant_utils.cuh" + #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh" #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu index 6bee9e4ce1166cd922b4979d9c9a594ef8e75c44..229d9862fb670338da17a762e9b25be6a389745b 100644 --- a/csrc/attention/merge_attn_states.cu +++ b/csrc/attention/merge_attn_states.cu @@ -46,6 +46,32 @@ __global__ void merge_attn_states_kernel( s_lse = std::isinf(s_lse) ? -std::numeric_limits::infinity() : s_lse; const float max_lse = fmaxf(p_lse, s_lse); + + /* In certain edge cases, MLA can produce p_lse = s_lse = -inf; + continuing the pipeline then yields NaN. Root cause: with chunked prefill + a batch may be split into two chunks; if a request in that batch has no + prefix hit, every LSE entry for that request’s position is -inf, and at + this moment we merge cross-attention at first. For now we simply emit + prefix_output (expected to be all zeros) and prefix_lse (-inf) to fix + this problem. + */ + if (std::isinf(max_lse)) { + if (pack_offset < head_size) { + // Pack 128b load + pack_128b_t p_out_pack = reinterpret_cast( + prefix_head_ptr)[pack_offset / pack_size]; + + // Pack 128b storage + reinterpret_cast(output_head_ptr)[pack_offset / pack_size] = + p_out_pack; + } + // We only need to write to output_lse once per head. + if (output_lse != nullptr && pack_idx == 0) { + output_lse[head_idx * num_tokens + token_idx] = max_lse; + } + return; + } + p_lse = p_lse - max_lse; s_lse = s_lse - max_lse; const float p_se = expf(p_lse); diff --git a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp index 297d94dcc06314320f4c8ce3f32614a3a76d76f0..2d4b4a67d242168dc36d4da63f56a50bc36cd9c2 100644 --- a/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp +++ b/csrc/attention/mla/cutlass_sm100_mla/device/sm100_mla.hpp @@ -125,32 +125,37 @@ public: } static void set_split_kv (KernelArguments& args) { - // printf("set_split_kv start"); if (args.split_kv >= 1) return; auto [H, K, D, B] = args.problem_shape; - // std::cout << H << " " << K << " " << D << " " << B << "\n"; int sm_count = args.hw_info.sm_count; - // printf(" sm_count = %d\n", sm_count); - int max_splits = ceil_div(K, 128); - max_splits = min(16, max_splits); - - // TODO: This avoids a hang when the batch size larger than 1 and - // there is more than 1 kv_splits. - // Discuss with NVIDIA how this can be fixed. - if (B > 1) { - max_splits = min(1, max_splits); + float seq_length_k = static_cast(K) / 1024.0f; + int max_splits = 1; + + if (B <= 4 && seq_length_k >= 16) { + max_splits = 16; + } + else if (B <= 8 && seq_length_k >= 4) { + max_splits = 8; + } + else if ((B <= 16 && seq_length_k >= 8) || + (B == 48 && seq_length_k >= 32)) { + max_splits = 4; + } + else if ((B <= 32 && seq_length_k >= 16) || + (B == 96 && seq_length_k >= 16)) { + max_splits = 2; } - - // printf(" max_splits = %d\n", max_splits); + else { + max_splits = 1; + } + + // Wave-aware scheduling: ensure integer number of waves in K dimension int sms_per_batch = max(1, sm_count / B); - // printf(" sms_per_batch = %d\n", sms_per_batch); int split_heur = min(max_splits, sms_per_batch); int waves = ceil_div(B * split_heur, sm_count); int k_waves = ceil_div(max_splits, split_heur); int split_wave_aware = ceil_div(max_splits, k_waves); args.split_kv = split_wave_aware; - // printf(" args.split_kv = %d\n", args.split_kv); - } /// Determines whether the GEMM can execute the given problem. diff --git a/csrc/cache.h b/csrc/cache.h index 427bd0d54fac5cded3659399b51c589b25a8e1b6..10e9bb25cf658b53fe4b0964b1ff8d3523c60fb6 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -64,3 +64,11 @@ void indexer_k_quant_and_cache( torch::Tensor& slot_mapping, // [num_tokens] int64_t quant_block_size, // quantization block size const std::string& scale_fmt); + +// Extract function to gather quantized K cache +void cp_gather_indexer_k_quant_cache( + const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride] + torch::Tensor& dst_k, // [num_tokens, head_dim] + torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4] + const torch::Tensor& block_table, // [batch_size, num_blocks] + const torch::Tensor& cu_seq_lens); // [batch_size + 1] diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 1286f5806d4b64b2381d2c00483480b24815013c..10f31a999e3b40d21dfa6e506863e525111a9d36 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -9,16 +9,17 @@ #include "quantization/vectorization_utils.cuh" #ifdef USE_ROCM - #include "quantization/fp8/amd/quant_utils.cuh" + #include "quantization/w8a8/fp8/amd/quant_utils.cuh" #else - #include "quantization/fp8/nvidia/quant_utils.cuh" + #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh" #endif #include #include -#include // FLT_MIN -#include -#include + +#include +// #include +// #include #ifdef USE_ROCM #include @@ -210,6 +211,20 @@ void copy_blocks_mla(std::vector const& kv_caches, namespace vllm { +// Used to copy/convert one element +template +struct CopyWithScaleOp { + float scale; + + __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst = static_cast(src); + } else { + dst = fp8::scaled_convert(src, scale); + } + } +}; + template __global__ void reshape_and_cache_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] @@ -225,58 +240,50 @@ __global__ void reshape_and_cache_kernel( const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { - // Padding token that should be ignored. return; } const int64_t block_idx = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; + const int h_block_count = head_size / x; // head_size//x - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int x_idx = head_offset / x; - const int x_offset = head_offset % x; - - const int64_t tgt_key_idx = - block_idx * num_heads * (head_size / x) * block_size * x + - head_idx * (head_size / x) * block_size * x + x_idx * block_size * x + - block_offset * x + x_offset; - const int64_t tgt_value_idx = - block_idx * num_heads * head_size * block_size + - head_idx * head_size * block_size + head_offset * block_size + - block_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - key_cache[tgt_key_idx] = tgt_key; - value_cache[tgt_value_idx] = tgt_value; - } else { - key_cache[tgt_key_idx] = - fp8::scaled_convert(tgt_key, *k_scale); - value_cache[tgt_value_idx] = - fp8::scaled_convert(tgt_value, *v_scale); - } + const int h_block_idx = threadIdx.x; + if (h_block_idx >= num_heads * h_block_count) { + return; } -} -// Used by vectorization_utils to copy/convert one element -template -struct CopyWithScaleOp { - float scale; + const int head_idx = h_block_idx / h_block_count; + const int h_block = h_block_idx % h_block_count; - __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - dst = static_cast(src); - } else { - dst = fp8::scaled_convert(src, scale); - } + const scalar_t* __restrict__ key_src = + key + token_idx * key_stride + head_idx * head_size + h_block * x; + const int64_t src_value_start = + token_idx * value_stride + head_idx * head_size + h_block * x; + + cache_t* __restrict__ key_dst = + key_cache + block_idx * num_heads * h_block_count * block_size * x + + head_idx * h_block_count * block_size * x + h_block * block_size * x + + block_offset * x; + const int64_t tgt_value_start = + block_idx * num_heads * h_block_count * x * block_size + + head_idx * h_block_count * x * block_size + h_block * x * block_size + + block_offset; + + constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; + float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; + CopyWithScaleOp k_op{k_scale_val}; + float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; + CopyWithScaleOp v_op{v_scale_val}; + + vectorize_with_alignment(key_src, key_dst, x, 0, 1, k_op); + + const scalar_t* __restrict__ value_src = value + src_value_start; + cache_t* __restrict__ value_dst = value_cache + tgt_value_start; +#pragma unroll + for (int i = 0; i < x; i++) { + v_op(value_dst[i * block_size], value_src[i]); } -}; +} template __global__ void reshape_and_cache_flash_kernel( @@ -424,84 +431,81 @@ __global__ void concat_and_cache_ds_mla_kernel( const int64_t dst_idx_start = block_idx * block_stride + block_offset * entry_stride; - // Create 4 tile scales in shared memory - __shared__ float smem[20]; - float* shard_abs_max = smem; - float* tile_scales = smem + 16; - - // For the NoPE part, each tile of 128 elements is handled by 4 warps - // (128 threads). There are 4 total tiles, so 16 warps (512 threads). - // The first thread of the first warp in each tile writes the scale - // value for the tile. The RoPE part (last 64 elements) is handled - // by another 2 warps (64 threads). - // So in total, we use 18 warps (576 threads) per block. + // For the NoPE part, each tile of 128 elements is handled by half of one warp + // (16 threads). There are 4 total tiles, so 2 warps (64 threads). + // Lanes 0 and 16 of each warp write the scale values for that warp's tiles. + // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads). + // So in total, we use 3 warps (96 threads) per block. // Cast kv_cache to 16_bit for RoPE values scalar_t* kv_cache_16bit = reinterpret_cast(&kv_cache[dst_idx_start]); - // The last 64 threads handle the RoPE part - if (threadIdx.x >= kv_lora_rank) { - const int8_t pe_idx = threadIdx.x - kv_lora_rank; - const int64_t src_idx = token_idx * k_pe_stride + pe_idx; + // The last warp handles the RoPE part + if (threadIdx.x >= 64) { + // Each thread handles two elements of RoPE + const int8_t pe_idx_start = (threadIdx.x - 64) * 2; + const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start; + // Vectorized load of two 16-bit values, performed as one 32-bit load + const int32_t vals = *reinterpret_cast(&k_pe[src_idx]); // RoPE values start after the packed 8-bit NoPE values and the // 32-bit scales - const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx; - kv_cache_16bit[dst_idx] = k_pe[src_idx]; + const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start; + // Vectorized store of two 16-bit values, performed as one 32-bit store + *reinterpret_cast(&kv_cache_16bit[dst_idx]) = vals; return; } - // Determine the scale for each chunk of NoPE - const int16_t tile_idx = threadIdx.x >> 7; - const int16_t warp_idx = (threadIdx.x & 127) >> 5; - const int16_t lane_idx = threadIdx.x & 31; - - // Load the NoPE element for this thread into registers - const int64_t src_idx = token_idx * kv_c_stride + threadIdx.x; - const scalar_t src_val = kv_c[src_idx]; - - // Warp-level reduction to find the max absolute value in the warp - float max_abs = fabsf(src_val); + // The first two warps handle the NoPE part + const int8_t warp_idx = threadIdx.x >> 5; + const int8_t lane_idx = threadIdx.x & 31; + const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4); + + // Each thread handles 8 elements of NoPE + // Load the NoPE elements for this thread into registers + const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8); + // Vectorized load of eight 16-bit values, performed as an int4 load + const int4 vals_i4 = *reinterpret_cast(&kv_c[src_idx_start]); + const scalar_t* vals = reinterpret_cast(&vals_i4); + + // Max absolute value of this thread's elements + float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])), + fmaxf(fabsf(vals[2]), fabsf(vals[3]))), + fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])), + fmaxf(fabsf(vals[6]), fabsf(vals[7])))); + + // Warp-level reduction to find the max absolute value in each half-warp #pragma unroll - for (int offset = 16; offset > 0; offset /= 2) { -#ifdef USE_ROCM - max_abs = fmaxf(max_abs, __shfl_down_sync(UINT64_MAX, max_abs, offset)); -#else - max_abs = fmaxf(max_abs, __shfl_down_sync(0xFFFFFFFF, max_abs, offset)); -#endif + for (int offset = 8; offset > 0; offset /= 2) { + max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16)); } - // The first lane of each warp in each tile writes the max_abs of this part - // of the tile to shared memory - if (lane_idx == 0) { - shard_abs_max[tile_idx * 4 + warp_idx] = max_abs; - } - __syncthreads(); - - // The first lane of the first warp in each tile computes the scale for the - // tile and writes it to shared memory and to kv_cache - if (warp_idx == 0 && lane_idx == 0) { - float4 shard_abs_max_vec = - reinterpret_cast(shard_abs_max)[tile_idx]; - float tile_scale = fmaxf(fmaxf(shard_abs_max_vec.x, shard_abs_max_vec.y), - fmaxf(shard_abs_max_vec.z, shard_abs_max_vec.w)) / - 448.f; - - // Avoid division by zero in `scaled_convert` - tile_scales[tile_idx] = fmaxf(tile_scale, FLT_MIN); + // Compute the scale for the tile + float tile_scale = max_abs / 448.f; + tile_scale = fmaxf(tile_scale, FLT_MIN); + + // The first lane of each half-warp writes the scale to kv_cache + if ((lane_idx == 0) || (lane_idx == 16)) { float* kv_cache_32bit = reinterpret_cast(&kv_cache[dst_idx_start]); const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx; - kv_cache_32bit[dst_idx] = tile_scales[tile_idx]; + kv_cache_32bit[dst_idx] = tile_scale; } - __syncthreads(); + // Now all threads in the block scale and write their elements + // NoPE data is packed in the first kv_lora_rank/2 bytes (first 256 bytes) + const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8); + + uint8_t result[8]; +#pragma unroll + for (int i = 0; i < 8; i++) { + result[i] = + fp8::scaled_convert( + vals[i], tile_scale); + } - // Now all threads in the block scale and write their element - const float scale_val = tile_scales[tile_idx]; - const int64_t dst_idx = dst_idx_start + threadIdx.x; - kv_cache[dst_idx] = - fp8::scaled_convert( - src_val, scale_val); + // Store as aligned 64-bit writes + *reinterpret_cast(&kv_cache[dst_idx_base]) = + *reinterpret_cast(result); } template @@ -571,6 +575,70 @@ __global__ void indexer_k_quant_and_cache_kernel( } } +template +__global__ void cp_gather_indexer_k_quant_cache_kernel( + const char* __restrict__ kv_cache, // [num_blocks, block_size, + // cache_stride] + char* __restrict__ dst_k, // [num_tokens, head_dim] + char* __restrict__ dst_scale, // [num_tokens, head_dim / quant_block_size * + // 4] + const int* __restrict__ block_table, // [batch_size, num_blocks] + const int* __restrict__ cu_seq_lens, // [batch_size + 1] + const int batch_size, // batch size + const int64_t token_stride, // stride for each token in dst_k + const int64_t head_dim, // dimension of each head + const int64_t block_stride, // stride for each block in kv_cache + const int64_t cache_token_stride, // stride for each token in kv_cache + const int64_t cache_block_size, // num_tokens for each block in kv_cache + const int num_blocks, // number of blocks + const int num_tokens, // number of tokens + const int quant_block_size // quantization block size +) { + constexpr int VEC_SIZE = sizeof(float4) / sizeof(char); + const int token_idx = blockIdx.x * blockDim.y + threadIdx.y; + const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE; + // Find batch index within a block + __shared__ int batch_idx[BLOCK_Y_SIZE]; + for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x)); + iter++) { + int tid = iter * blockDim.x + threadIdx.x; + if (tid < batch_size) { + const int seq_start = cu_seq_lens[tid]; + const int seq_end = cu_seq_lens[tid + 1]; + if (token_idx >= seq_start && token_idx < seq_end) { + batch_idx[threadIdx.y] = tid; + } + } + } + +#ifndef USE_ROCM + __syncwarp(); +#endif + + if (head_idx >= head_dim || token_idx >= num_tokens) { + return; + } + const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]]; + const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks + + inbatch_seq_idx / cache_block_size]; + const int64_t src_block_offset = block_idx * block_stride; + const int64_t cache_inblock_offset = + (inbatch_seq_idx % cache_block_size) * head_dim + head_idx; + const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset; + const int64_t dst_inblock_offset = token_idx * token_stride + head_idx; + + reinterpret_cast(dst_k)[dst_inblock_offset / VEC_SIZE] = + reinterpret_cast(kv_cache)[src_inblock_offset / VEC_SIZE]; + ; + if (threadIdx.x == 0) { + const int64_t src_scale_offset = + src_block_offset + cache_block_size * head_dim + + cache_inblock_offset * 4 / quant_block_size; + reinterpret_cast(dst_scale)[dst_inblock_offset / quant_block_size] = + reinterpret_cast(kv_cache)[src_scale_offset / 4]; + } +} + } // namespace vllm // KV_T is the data type of key and value tensors. @@ -606,9 +674,10 @@ void reshape_and_cache( int key_stride = key.stride(0); int value_stride = value.stride(0); + int head_div_x = head_size / x; dim3 grid(num_tokens); - dim3 block(std::min(num_heads * head_size, 512)); + dim3 block(std::min(num_heads * head_div_x, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -741,13 +810,12 @@ void concat_and_cache_mla( if (kv_cache_dtype == "fp8_ds_mla") { dim3 grid(num_tokens); - // For the NoPE part, each tile of 128 elements is handled by 4 warps - // (128 threads). There are 4 total tiles, so 16 warps (512 threads). - // The first thread of the first warp in each tile writes the scale - // value for the tile. The RoPE part (last 64 elements) is handled - // by another 2 warps (64 threads). - // So in total, we use 18 warps (576 threads) per block. - dim3 block(576); + // For the NoPE part, each tile of 128 elements is handled by half of one + // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads). + // Lanes 0 and 16 of each warp write the scale values for that warp's tiles. + // The RoPE part (last 64 elements) is handled by another 1 warp (32 + // threads). So in total, we use 3 warps (96 threads) per block. + dim3 block(96); DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, CALL_CONCAT_AND_CACHE_DS_MLA); } else { @@ -1172,3 +1240,59 @@ void indexer_k_quant_and_cache( DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3", CALL_INDEXER_K_QUANT_AND_CACHE); } + +// Macro to dispatch the kernel based on the data amount. +#define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE) \ + vllm::cp_gather_indexer_k_quant_cache_kernel \ + <<>>( \ + reinterpret_cast(kv_cache.data_ptr()), \ + reinterpret_cast(dst_k.data_ptr()), \ + reinterpret_cast(dst_scale.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0), \ + kv_cache.stride(1), kv_cache.size(1), block_table.size(1), \ + num_tokens, quant_block_size); + +void cp_gather_indexer_k_quant_cache( + const torch::Tensor& kv_cache, // [num_blocks, block_size, cache_stride] + torch::Tensor& dst_k, // [num_tokens, head_dim] + torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4] + const torch::Tensor& block_table, // [batch_size, num_blocks] + const torch::Tensor& cu_seq_lens // [batch_size + 1] +) { + int batch_size = block_table.size(0); + int num_tokens = dst_k.size(0); + int head_dim = dst_k.size(1); + int quant_block_size = head_dim * 4 / dst_scale.size(1); + + TORCH_CHECK(kv_cache.device() == dst_k.device(), + "kv_cache and dst_k must be on the same device"); + TORCH_CHECK(kv_cache.device() == dst_scale.device(), + "kv_cache and dst_scale must be on the same device"); + TORCH_CHECK(kv_cache.device() == block_table.device(), + "kv_cache and block_table must be on the same device"); + TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(), + "kv_cache and cu_seq_lens must be on the same device"); + TORCH_CHECK(head_dim % quant_block_size == 0, + "head_dim must be divisible by quant_block_size"); + + constexpr int vec_size = 16; + const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (num_tokens < 32) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1); + } else if (num_tokens < 64) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2); + } else if (num_tokens < 128) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4); + } else if (num_tokens < 256) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8); + } else if (num_tokens < 512) { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16); + } else { + CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32); + } +} diff --git a/csrc/core/batch_invariant.hpp b/csrc/core/batch_invariant.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fffe96b868575d17b9191ab610a2862ff00b0a43 --- /dev/null +++ b/csrc/core/batch_invariant.hpp @@ -0,0 +1,19 @@ +#pragma once +#include +#include +#include + +namespace vllm { + +// vllm_is_batch_invariant(); returns true +// if env VLLM_BATCH_INVARIANT=1 +inline bool vllm_is_batch_invariant() { + static bool cached = []() { + std::string env_key = "VLLM_BATCH_INVARIANT"; + const char* val = std::getenv(env_key.c_str()); + return (val && std::atoi(val) != 0) ? 1 : 0; + }(); + return cached; +} + +} // namespace vllm diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp deleted file mode 100644 index 82862fea7f2be78bb562bfce673b324b3ad97b3d..0000000000000000000000000000000000000000 --- a/csrc/cpu/attention.cpp +++ /dev/null @@ -1,798 +0,0 @@ -#include "cpu_types.hpp" - -namespace { - -template -struct KernelVecType { - using q_load_vec_type = void; - using q_vec_type = void; - using k_load_vec_type = void; - using k_vec_type = void; - using qk_acc_vec_type = void; - using v_load_vec_type = void; -}; - -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::FP32Vec4; - using q_vec_type = vec_op::FP32Vec16; - using k_load_vec_type = vec_op::FP32Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::FP32Vec16; -}; - -template <> -struct KernelVecType { -#if defined(__powerpc64__) || defined(__s390x__) - // Power and s390x architecture-specific vector types - using q_load_vec_type = vec_op::FP32Vec8; - using k_load_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::FP32Vec16; -#else - // Fallback for other architectures, including x86 - using q_load_vec_type = vec_op::FP16Vec8; - using k_load_vec_type = vec_op::FP16Vec16; - using v_load_vec_type = vec_op::FP16Vec16; -#endif - using q_vec_type = vec_op::FP32Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; -}; - -#ifdef __AVX512BF16__ -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::BF16Vec8; - using q_vec_type = vec_op::BF16Vec32; - using k_load_vec_type = vec_op::BF16Vec32; - using k_vec_type = vec_op::BF16Vec32; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; -}; -#else - #ifdef __aarch64__ - #ifndef ARM_BF16_SUPPORT - // pass - #else -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::BF16Vec8; - using q_vec_type = vec_op::FP32Vec16; - using k_load_vec_type = vec_op::BF16Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; -}; - #endif - #else -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::BF16Vec8; - using q_vec_type = vec_op::FP32Vec16; - using k_load_vec_type = vec_op::BF16Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; -}; - #endif -#endif - -template -FORCE_INLINE std::pair reduceSoftmax(T* data, const int size, - const int capacity) { - T max = data[0]; - for (int i = 1; i < size; ++i) { - max = max >= data[i] ? max : data[i]; - } - - T sum = 0; - for (int i = 0; i < size; ++i) { - data[i] = std::exp(data[i] - max); - sum += data[i]; - } - - int i = 0; - for (; i < size; ++i) { - data[i] /= sum; - } - - for (; i < capacity; ++i) { - data[i] = 0; - } - - return {max, sum}; -} - -template -FORCE_INLINE std::pair reduceSoftmaxAlibi(T* data, const int size, - const int capacity, - const float alibi_slope, - const int start_index, - const int seq_len) { - data[0] += alibi_slope * (start_index - seq_len + 1); - T max = data[0]; - for (int i = 1; i < size; ++i) { - T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1); - data[i] = qk; - max = max >= qk ? max : qk; - } - - T sum = 0; - for (int i = 0; i < size; ++i) { - data[i] = std::exp(data[i] - max); - sum += data[i]; - } - - int i = 0; - for (; i < size; ++i) { - data[i] /= sum; - } - - for (; i < capacity; ++i) { - data[i] = 0; - } - - return {max, sum}; -} - -template -FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data, - const int size) { - T max = max_data[0]; - for (int i = 1; i < size; ++i) { - max = max >= max_data[i] ? max : max_data[i]; - } - - T rescaled_sum = 0; - for (int i = 0; i < size; ++i) { - T rescale_factor = std::exp(max_data[i] - max); - rescaled_sum += rescale_factor * sum_data[i]; - sum_data[i] *= rescale_factor; - } - for (int i = 0; i < size; ++i) { - sum_data[i] /= rescaled_sum + 1e-8; - } -} - -template -struct reduceQKBlockKernel { - using q_load_vec_type = typename KernelVecType::q_load_vec_type; - using q_vec_type = typename KernelVecType::q_vec_type; - using k_load_vec_type = typename KernelVecType::k_load_vec_type; - using k_vec_type = typename KernelVecType::k_vec_type; - using qk_acc_vec_type = typename KernelVecType::qk_acc_vec_type; - - constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x; - constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP; - constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4; - - static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4); - static_assert(k_load_vec_type::get_elem_num() % x == 0); - static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16); - - FORCE_INLINE static void call(const scalar_t* __restrict__ q, - const scalar_t* __restrict__ k_block, - float* __restrict__ logits, float scale, - const int token_num) { - const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP; - - qk_acc_vec_type group_accums[MAX_GROUP_NUM]; - if (token_num == BLOCK_SIZE) { - for (int q_offset = 0; q_offset < HEAD_SIZE; - q_offset += x, k_block += x * BLOCK_SIZE) { - q_load_vec_type q_load_group_vec(q + q_offset); - q_vec_type q_group_vec(q_load_group_vec); - - vec_op::unroll_loop( - [k_block, &q_group_vec, &group_accums](int token_group_idx) { - k_load_vec_type k_load_group_vec(k_block + token_group_idx * x * - TOKEN_PER_GROUP); - k_vec_type k_group_vec(k_load_group_vec); - vec_op::fma(group_accums[token_group_idx], q_group_vec, - k_group_vec); - vec_op::prefetch(k_block + x * BLOCK_SIZE + - token_group_idx * x * TOKEN_PER_GROUP); - }); - } - } else { - for (int q_offset = 0; q_offset < HEAD_SIZE; - q_offset += x, k_block += x * BLOCK_SIZE) { - q_load_vec_type q_load_group_vec(q + q_offset); - q_vec_type q_group_vec(q_load_group_vec); - for (int token_group_start = 0; token_group_start < group_num; - token_group_start += UNROLL_GROUP_NUM) { - vec_op::unroll_loop( - [token_group_start, k_block, &q_group_vec, - &group_accums](int token_group_idx) { - token_group_idx += token_group_start; - k_load_vec_type k_load_group_vec(k_block + token_group_idx * x * - TOKEN_PER_GROUP); - k_vec_type k_group_vec(k_load_group_vec); - vec_op::fma(group_accums[token_group_idx], q_group_vec, - k_group_vec); - vec_op::prefetch(k_block + x * BLOCK_SIZE + - token_group_idx * x * TOKEN_PER_GROUP); - }); - } - } - } - - for (int token_group_idx = 0; token_group_idx < group_num; - ++token_group_idx) { - vec_op::unroll_loop( - [&group_accums, logits, scale, token_group_idx](int token_idx) { - float dot_v = - group_accums[token_group_idx] - .template reduce_sub_sum(token_idx); - logits[token_group_idx * TOKEN_PER_GROUP + token_idx] = - dot_v * scale; - }); - } - } -}; - -template -FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block, - acc_t&& acc) { - using v_load_vec_type = typename KernelVecType::v_load_vec_type; - constexpr int ELEM_NUM = v_load_vec_type::get_elem_num(); - static_assert(BLOCK_SIZE == ELEM_NUM); - vec_op::FP32Vec16 prob_vec(prob); - - vec_op::unroll_loop([&](int head_elem_idx) { - v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx); - vec_op::FP32Vec16 fp32_v_vec(v_vec); - acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec; - }); -} -}; // namespace - -// Paged attention v1 -namespace { -template -struct paged_attention_v1_impl { - static void call( - scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, - // head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, - // head_size, block_size] - const int num_kv_heads, const float scale, - const int* __restrict__ block_tables, // [num_seqs, - // max_num_blocks_per_seq] - const int* __restrict__ seq_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, const int kv_block_stride, const int kv_head_stride, - const int num_seqs, const int num_heads) { - constexpr int x = 16 / sizeof(scalar_t); - const int num_queries_per_kv = num_heads / num_kv_heads; - - static_assert(BLOCK_SIZE == 16); - - int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE; - int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0; - TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0); - - const int parallel_work_item_num = omp_get_max_threads(); - - size_t logits_bytes = - parallel_work_item_num * max_seq_len_padded * sizeof(float); - float* logits = (float*)std::aligned_alloc( - 64, logits_bytes); // Cacheline alignment for each context token. - // [parallel_work_item_num, max_seq_len_padded] - -#pragma omp parallel for collapse(2) schedule(dynamic, 1) - for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - int seq_len = seq_lens[seq_idx]; - const int* seq_block_table = - block_tables + max_num_blocks_per_seq * seq_idx; - const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; - const int64_t kv_head_idx = head_idx / num_queries_per_kv; - const scalar_t* __restrict__ q_vec_ptr = - q + seq_idx * q_stride + head_idx * HEAD_SIZE; - const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE; - float* __restrict__ thread_block_logits = - logits + omp_get_thread_num() * max_seq_len_padded; - - // Compute logits - for (int block_idx = 0; block_idx < block_num; ++block_idx) { - const int64_t physical_block_idx = seq_block_table[block_idx]; - const scalar_t* __restrict__ k_block_cache_ptr = - k_cache + physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride; - float* __restrict__ head_block_logits = - thread_block_logits + block_idx * BLOCK_SIZE; - - reduceQKBlockKernel::call( - q_vec_ptr, k_block_cache_ptr, head_block_logits, scale, - block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE); - } - - // Compute softmax - if (alibi_slopes) { - reduceSoftmaxAlibi(thread_block_logits, seq_len, - block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0, - seq_len); - } else { - reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE); - } - - // Compute value - constexpr int head_elem_num_per_partition = 16; - constexpr int head_partition_num = - HEAD_SIZE / head_elem_num_per_partition; - for (int head_part_idx = 0; head_part_idx < head_partition_num; - ++head_part_idx) { - vec_op::FP32Vec16 accums[head_elem_num_per_partition]; - scalar_t* __restrict__ out_ptr = - out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + - head_part_idx * head_elem_num_per_partition; - for (int block_idx = 0; block_idx < block_num; ++block_idx) { - const int64_t physical_block_idx = seq_block_table[block_idx]; - const float* __restrict__ prob_vec_ptr = - thread_block_logits + block_idx * BLOCK_SIZE; - const scalar_t* __restrict__ v_block_cache_ptr = - v_cache + physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride + - BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; - reduceValueBlock( - prob_vec_ptr, v_block_cache_ptr, accums); - - if (block_idx != block_num - 1) { - const int64_t next_physical_block_idx = - seq_block_table[block_idx + 1]; - const scalar_t* __restrict__ next_v_block_cache_ptr = - v_cache + next_physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride + - BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; - vec_op::unroll_loop( - [&](int head_elem_idx) { - if (head_elem_idx % 2 == 0) { - vec_op::prefetch(next_v_block_cache_ptr + - BLOCK_SIZE * head_elem_idx); - } - }); - } - } - - vec_op::unroll_loop( - [&](int head_elem_idx) { - float value = accums[head_elem_idx].reduce_sum(); - vec_op::storeFP32(value, out_ptr + head_elem_idx); - }); - } - } - } - std::free(logits); - } -}; - -#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ - paged_attention_v1_impl::call( \ - out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \ - block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \ - num_heads); - -template -void paged_attention_v1_impl_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const std::optional& alibi_slopes) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - switch (head_size) { - case 32: - LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); - break; - case 64: - LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); - break; - case 80: - LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); - break; - case 96: - LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); - break; - case 112: - LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); - break; - case 128: - LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); - break; - case 192: - LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); - break; - case 256: - LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ - paged_attention_v1_impl_launcher( \ - out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes); - -#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ - switch (block_size) { \ - case 16: \ - CALL_V1_KERNEL_LAUNCHER(T, 16); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } -} // namespace - -void paged_attention_v1( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, torch::Tensor& k_scale, - torch::Tensor& v_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(blocksparse_vert_stride <= 1, - "CPU backend does not support blocksparse attention yet."); - VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", - [&] { - CPU_KERNEL_GUARD_IN(paged_attention_v1_impl) - CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); - CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl) - }); -} - -// Paged attention v2 -namespace { -template -struct paged_attention_v2_impl { - static void call( - scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] - float* __restrict__ exp_sums, // [num_seqs, num_heads, - // max_num_partitions] - float* __restrict__ max_logits, // [num_seqs, num_heads, - // max_num_partitions] - scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, - // max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, - // head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, - // head_size, block_size] - const int num_kv_heads, const float scale, - const int* __restrict__ block_tables, // [num_seqs, - // max_num_blocks_per_seq] - const int* __restrict__ seq_lens, // [num_seqs] - const int max_num_blocks_per_seq, - const float* __restrict__ alibi_slopes, // [num_heads] - const int q_stride, const int kv_block_stride, const int kv_head_stride, - const int num_seqs, const int num_heads, const int max_num_partitions) { - constexpr int x = 16 / sizeof(scalar_t); - const int num_queries_per_kv = num_heads / num_kv_heads; - - static_assert(BLOCK_SIZE == 16); - static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0); - static_assert(PARTITION_SIZE % BLOCK_SIZE == 0); - -#pragma omp parallel for collapse(3) schedule(static, 1) - for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { - for (int partition_idx = 0; partition_idx < max_num_partitions; - ++partition_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - const int seq_len = seq_lens[seq_idx]; - const int start_token_idx = partition_idx * PARTITION_SIZE; - - if (start_token_idx >= seq_len) continue; - - const int partition_num = - (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; - const bool no_reduce = (partition_num == 1); - const int token_num = - (std::min(seq_len, start_token_idx + PARTITION_SIZE) - - start_token_idx); - const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE; - const int last_block_token_num = - token_num - (block_num - 1) * BLOCK_SIZE; - const int* seq_block_table = block_tables + - max_num_blocks_per_seq * seq_idx + - start_token_idx / BLOCK_SIZE; - const int64_t kv_head_idx = head_idx / num_queries_per_kv; - const scalar_t* __restrict__ q_vec_ptr = - q + seq_idx * q_stride + head_idx * HEAD_SIZE; - - float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0}; - - // Compute logits - for (int block_idx = 0; block_idx < block_num; ++block_idx) { - const int64_t physical_block_idx = seq_block_table[block_idx]; - const scalar_t* __restrict__ k_block_cache_ptr = - k_cache + physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride; - float* __restrict__ head_block_logits = - logits + block_idx * BLOCK_SIZE; - - reduceQKBlockKernel::call( - q_vec_ptr, k_block_cache_ptr, head_block_logits, scale, - block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE); - } - - std::pair max_and_sum; - if (alibi_slopes) { - max_and_sum = reduceSoftmaxAlibi( - logits, token_num, block_num * BLOCK_SIZE, - alibi_slopes[head_idx], start_token_idx, seq_len); - } else { - max_and_sum = - reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE); - } - - auto&& [max_logit, exp_sum] = max_and_sum; - - scalar_t* __restrict__ output_buffer = nullptr; - if (!no_reduce) { - auto idx = seq_idx * num_heads * max_num_partitions + - head_idx * max_num_partitions + partition_idx; - max_logits[idx] = max_logit; - exp_sums[idx] = exp_sum; - output_buffer = - tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + - head_idx * max_num_partitions * HEAD_SIZE + - partition_idx * HEAD_SIZE; - } else { - output_buffer = - out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE; - } - - // Compute value - constexpr int head_elem_num_per_partition = 16; - constexpr int head_partition_num = - HEAD_SIZE / head_elem_num_per_partition; - for (int head_part_idx = 0; head_part_idx < head_partition_num; - ++head_part_idx) { - vec_op::FP32Vec16 accums[head_elem_num_per_partition]; - scalar_t* __restrict__ out_ptr = - output_buffer + head_part_idx * head_elem_num_per_partition; - for (int block_idx = 0; block_idx < block_num; ++block_idx) { - const int64_t physical_block_idx = seq_block_table[block_idx]; - const float* __restrict__ prob_vec_ptr = - logits + block_idx * BLOCK_SIZE; - const scalar_t* __restrict__ v_block_cache_ptr = - v_cache + physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride + - BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; - reduceValueBlock( - prob_vec_ptr, v_block_cache_ptr, accums); - - if (block_idx != block_num - 1) { - const int64_t next_physical_block_idx = - seq_block_table[block_idx + 1]; - const scalar_t* __restrict__ next_v_block_cache_ptr = - v_cache + next_physical_block_idx * kv_block_stride + - kv_head_idx * kv_head_stride + - BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; - vec_op::unroll_loop( - [&](int head_elem_idx) { - if (head_elem_idx % 2 == 0) { - vec_op::prefetch(next_v_block_cache_ptr + - BLOCK_SIZE * head_elem_idx); - } - }); - } - } - - vec_op::unroll_loop( - [&](int head_elem_idx) { - float value = accums[head_elem_idx].reduce_sum(); - vec_op::storeFP32(value, out_ptr + head_elem_idx); - }); - } - } - } - } - - // Rescale partition softmax and store the factors to exp_sums -#pragma omp parallel for collapse(2) schedule(static, 1) - for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - const int seq_len = seq_lens[seq_idx]; - const int partition_num = - (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; - - if (partition_num == 1) continue; - - reducePartitionSoftmax( - max_logits + seq_idx * num_heads * max_num_partitions + - head_idx * max_num_partitions, - exp_sums + seq_idx * num_heads * max_num_partitions + - head_idx * max_num_partitions, - partition_num); - } - } - - // Reduce values - using v_load_vec_type = typename KernelVecType::v_load_vec_type; - static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE); - constexpr int head_elem_num_per_group = - 16; // Note: didn't align with the cacheline size, due to some - // HEAD_SIZE didn't align with 64 bytes - static_assert(HEAD_SIZE % head_elem_num_per_group == 0); - constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group; - const float* __restrict__ rescale_factors = exp_sums; -#pragma omp parallel for collapse(3) schedule(static, 1) - for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - for (int group_idx = 0; group_idx < head_group_num; ++group_idx) { - const int seq_len = seq_lens[seq_idx]; - const int partition_num = - (seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE; - - if (partition_num == 1) continue; - - const float* __restrict__ seq_head_rescale_factors = - rescale_factors + seq_idx * num_heads * max_num_partitions + - head_idx * max_num_partitions; - const scalar_t* __restrict__ seq_head_tmp_out = - tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE + - head_idx * max_num_partitions * HEAD_SIZE + - group_idx * head_elem_num_per_group; - scalar_t* __restrict__ seq_head_output = - out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + - group_idx * head_elem_num_per_group; - - vec_op::FP32Vec16 acc; - for (int i = 0; i < partition_num; ++i) { - vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]); - v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE); - vec_op::FP32Vec16 fp32_value(value); - acc = acc + fp32_value * rescale_factor; - } - v_load_vec_type cast_acc(acc); - cast_acc.save(seq_head_output); - } - } - } - } -}; - -#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \ - paged_attention_v2_impl::call( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \ - key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ - seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, num_seqs, num_heads, \ - max_num_partitions); - -template -void paged_attention_v2_impl_launcher( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const std::optional& alibi_slopes) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - int max_num_partitions = exp_sums.size(-1); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - switch (head_size) { - case 32: - LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); - break; - case 64: - LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); - break; - case 80: - LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE); - break; - case 96: - LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE); - break; - case 112: - LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE); - break; - case 128: - LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE); - break; - case 192: - LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE); - break; - case 256: - LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \ - paged_attention_v2_impl_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \ - alibi_slopes); - -#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \ - switch (block_size) { \ - case 16: \ - CALL_V2_KERNEL_LAUNCHER(T, 16); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } -} // namespace - -void paged_attention_v2( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int64_t num_kv_heads, double scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, torch::Tensor& k_scale, - torch::Tensor& v_scale, const int64_t tp_rank, - const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(blocksparse_vert_stride <= 1, - "CPU backend does not support blocksparse attention yet."); - VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", - [&] { - CPU_KERNEL_GUARD_IN(paged_attention_v2_impl) - CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); - CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) - }); -} \ No newline at end of file diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp deleted file mode 100644 index 69f6d06e3c9679f28000a262df0520bb2a549c9d..0000000000000000000000000000000000000000 --- a/csrc/cpu/cache.cpp +++ /dev/null @@ -1,214 +0,0 @@ -#include -#include - -#include "cpu_types.hpp" - -#if defined(__x86_64__) - #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2 -#else - #define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES -#endif - -namespace { -template -void copy_blocks_cpu_impl(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& mapping_pairs, - const int element_num_per_block, - const int layer_num) { - const size_t pair_num = mapping_pairs.size(0); - const size_t block_bytes = sizeof(scalar_t) * element_num_per_block; -#pragma omp parallel for collapse(2) - for (int layer = 0; layer < layer_num; ++layer) { - for (size_t pair = 0; pair < pair_num; ++pair) { - int64_t source_offset = - element_num_per_block * mapping_pairs[pair][0].item(); - int64_t target_offset = - element_num_per_block * mapping_pairs[pair][1].item(); - scalar_t* key_cache_ptr = key_caches[layer].data_ptr(); - scalar_t* source_ptr = key_cache_ptr + source_offset; - scalar_t* target_ptr = key_cache_ptr + target_offset; - std::memcpy(target_ptr, source_ptr, block_bytes); - - scalar_t* value_cache_ptr = value_caches[layer].data_ptr(); - source_ptr = value_cache_ptr + source_offset; - target_ptr = value_cache_ptr + target_offset; - std::memcpy(target_ptr, source_ptr, block_bytes); - } - } -} - -template -void reshape_and_cache_cpu_impl( - const scalar_t* __restrict__ key, const scalar_t* __restrict__ value, - scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache, - const int64_t* __restrict__ slot_mapping, const int num_tokens, - const int key_stride, const int value_stride, const int num_heads, - const int head_size, const int block_size, const int x) { - const int block_elem_num = num_heads * head_size * block_size; - -#pragma omp parallel for collapse(2) - for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { - for (int head_idx = 0; head_idx < num_heads; ++head_idx) { - const int64_t slot_idx = slot_mapping[token_idx]; - if (slot_idx >= 0) { - int src_key_head_idx = token_idx * key_stride + head_idx * head_size; - int src_value_head_idx = - token_idx * value_stride + head_idx * head_size; - const scalar_t* src_key_head_ptr = key + src_key_head_idx; - const scalar_t* src_value_head_ptr = value + src_value_head_idx; - const int64_t block_index = slot_idx / block_size; - const int64_t block_offset = slot_idx % block_size; - scalar_t* target_key_head_ptr = key_cache + - block_elem_num * block_index + - head_idx * block_size * head_size; - scalar_t* target_value_head_ptr = value_cache + - block_elem_num * block_index + - head_idx * block_size * head_size; - - for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) { - const int64_t target_offset = - src_key_idx * block_size + block_offset * x; - for (int i = 0; i < x; ++i) { - target_key_head_ptr[target_offset + i] = - src_key_head_ptr[src_key_idx + i]; - } - } - - for (int src_value_idx = 0; src_value_idx < head_size; - ++src_value_idx) { - const int64_t target_offset = - src_value_idx * block_size + block_offset; - target_value_head_ptr[target_offset] = - src_value_head_ptr[src_value_idx]; - } - } - } - } -} -}; // namespace - -template -void concat_and_cache_mla_cpu_impl( - const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank] - const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim] - scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank - // + pe_dim)] - const int64_t* __restrict__ slot_mapping, // [num_tokens] - const int num_tokens, // - const int block_stride, // - const int entry_stride, // - const int kv_c_stride, // - const int k_pe_stride, // - const int kv_lora_rank, // - const int pe_dim, // - const int block_size // -) { -#pragma omp parallel for - for (int token_idx = 0; token_idx < num_tokens; ++token_idx) { - const int64_t slot_idx = slot_mapping[token_idx]; - // NOTE: slot_idx can be -1 if the token is padded - if (slot_idx < 0) { - continue; - } - const int64_t block_idx = slot_idx / block_size; - const int64_t block_offset = slot_idx % block_size; - - auto copy = [&](const scalar_t* __restrict__ src, - scalar_t* __restrict__ dst, int src_stride, int dst_stride, - int size, int offset) { - for (int i = 0; i < size; i++) { - const int64_t src_idx = token_idx * src_stride + i; - const int64_t dst_idx = - block_idx * block_stride + block_offset * entry_stride + i + offset; - dst[dst_idx] = src[src_idx]; - } - }; - - copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0); - copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); - } -} - -// Note: the key_caches and value_caches vectors are constant but -// not the Tensors they contain. The vectors need to be const refs -// in order to satisfy pytorch's C++ operator registration code. -void copy_blocks(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& block_mapping) { - unsigned num_layers = key_caches.size(); - TORCH_CHECK(num_layers == value_caches.size()); - if (num_layers == 0) { - return; - } - - const int element_num_per_block = key_caches[0][0].numel(); - DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl) - copy_blocks_cpu_impl(key_caches, value_caches, block_mapping, - element_num_per_block, num_layers); - CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl) - }); -} - -void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, - torch::Tensor& key_cache, torch::Tensor& value_cache, - torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, - torch::Tensor& k_scale, torch::Tensor& v_scale) { - int num_tokens = key.size(0); - int num_heads = key.size(1); - int head_size = key.size(2); - int block_size = key_cache.size(3); - int x = key_cache.size(4); - - int key_stride = key.stride(0); - int value_stride = value.stride(0); - - DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl) - reshape_and_cache_cpu_impl( - key.data_ptr(), value.data_ptr(), - key_cache.data_ptr(), value_cache.data_ptr(), - slot_mapping.data_ptr(), num_tokens, key_stride, value_stride, - num_heads, head_size, block_size, x); - CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl) - }); -} - -void concat_and_cache_mla( - torch::Tensor& kv_c, // [num_tokens, kv_lora_rank] - torch::Tensor& k_pe, // [num_tokens, pe_dim] - torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank + - // pe_dim)] - torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] - const std::string& kv_cache_dtype, torch::Tensor& scale) { - int num_tokens = slot_mapping.size(0); - int kv_lora_rank = kv_c.size(1); - int pe_dim = k_pe.size(1); - int block_size = kv_cache.size(1); - - TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); - TORCH_CHECK(kv_cache_dtype != "fp8"); - - int kv_c_stride = kv_c.stride(0); - int k_pe_stride = k_pe.stride(0); - int block_stride = kv_cache.stride(0); - int entry_stride = kv_cache.stride(1); - - VLLM_DISPATCH_FLOATING_TYPES( - kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] { - CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl) - concat_and_cache_mla_cpu_impl( - kv_c.data_ptr(), k_pe.data_ptr(), - kv_cache.data_ptr(), slot_mapping.data_ptr(), - num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride, - kv_lora_rank, pe_dim, block_size); - CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl) - }); -} - -void swap_blocks(torch::Tensor& src, torch::Tensor& dst, - const torch::Tensor& block_mapping) { - TORCH_CHECK(false, "swap_blocks is unsupported on CPU.") -} diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..50f17c758c148774fe331c30200232f18c35b7bb --- /dev/null +++ b/csrc/cpu/cpu_attn.cpp @@ -0,0 +1,249 @@ +#include "cpu_attn_vec.hpp" +#include "cpu_attn_vec16.hpp" + +#ifdef CPU_CAPABILITY_AMXBF16 + #include "cpu_attn_amx.hpp" + #define AMX_DISPATCH(...) \ + case cpu_attention::ISA::AMX: { \ + using attn_impl = cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } +#else + #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX: +#endif + +#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \ + case HEAD_DIM: { \ + constexpr size_t head_dim = HEAD_DIM; \ + return __VA_ARGS__(); \ + } + +#define CPU_ATTN_DISPATCH_CASE_HEADDIM(HEAD_DIM, ...) \ + [&] { \ + switch (HEAD_DIM) { \ + CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(224, __VA_ARGS__) \ + CPU_ATTN_DISPATCH_CASE(256, __VA_ARGS__) \ + default: { \ + TORCH_CHECK(false, "Invalid CPU attention head_dim: " + \ + std::to_string(HEAD_DIM)); \ + } \ + } \ + }() + +#define CPU_ATTN_DISPATCH_IMPL(ISA_TYPE, ...) \ + [&] { \ + switch (ISA_TYPE) { \ + AMX_DISPATCH(__VA_ARGS__) \ + case cpu_attention::ISA::VEC: { \ + using attn_impl = \ + cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } \ + case cpu_attention::ISA::VEC16: { \ + using attn_impl = \ + cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } \ + default: { \ + TORCH_CHECK(false, "Invalid CPU attention ISA type."); \ + } \ + } \ + }() + +torch::Tensor get_scheduler_metadata( + const int64_t num_req, const int64_t num_heads_q, + const int64_t num_heads_kv, const int64_t head_dim, + const torch::Tensor& seq_lens, at::ScalarType dtype, + const torch::Tensor& query_start_loc, const bool casual, + const int64_t window_size, const std::string& isa_hint, + const bool enable_kv_split) { + cpu_attention::ISA isa; + if (isa_hint == "amx") { + isa = cpu_attention::ISA::AMX; + } else if (isa_hint == "vec") { + isa = cpu_attention::ISA::VEC; + } else if (isa_hint == "vec16") { + isa = cpu_attention::ISA::VEC16; + } else { + TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint); + } + + cpu_attention::AttentionScheduler::ScheduleInput input; + input.num_reqs = num_req; + input.num_heads_q = num_heads_q; + input.num_heads_kv = num_heads_kv; + input.head_dim = head_dim; + input.query_start_loc = query_start_loc.data_ptr(); + input.seq_lens = seq_lens.data_ptr(); + if (window_size != -1) { + input.left_sliding_window_size = window_size - 1; + if (casual) { + input.right_sliding_window_size = 0; + } else { + input.right_sliding_window_size = window_size - 1; + } + } else { + input.left_sliding_window_size = -1; + if (casual) { + input.right_sliding_window_size = 0; + } else { + input.right_sliding_window_size = -1; + } + } + input.casual = casual; + input.isa = isa; + input.enable_kv_split = enable_kv_split; + TORCH_CHECK(casual, "Only supports casual mask for now."); + + VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() { + CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] { + CPU_ATTN_DISPATCH_IMPL(isa, [&]() { + input.elem_size = sizeof(scalar_t); + input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t); + input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t); + input.output_buffer_elem_size = + sizeof(attn_impl::partial_output_buffer_t); + input.max_num_q_per_iter = attn_impl::MaxQHeadNumPerIteration; + input.kv_block_alignment = attn_impl::BlockSizeAlignment; + }); + }); + }); + + cpu_attention::AttentionScheduler scheduler; + torch::Tensor metadata = scheduler.schedule(input); + return metadata; +} + +void cpu_attn_reshape_and_cache( + const torch::Tensor& key, // [token_num, head_num, head_size] + const torch::Tensor& value, // [token_num, head_num, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_kv_heads, block_size, head_size] + torch::Tensor& + value_cache, // [num_blocks, num_kv_heads, block_size, head_size] + const torch::Tensor& slot_mapping, const std::string& isa) { + TORCH_CHECK_EQ(key.dim(), 3); + TORCH_CHECK_EQ(value.dim(), 3); + TORCH_CHECK_EQ(key_cache.dim(), 4); + TORCH_CHECK_EQ(value_cache.dim(), 4); + TORCH_CHECK_EQ(key.stride(2), 1); + TORCH_CHECK_EQ(value.stride(2), 1); + + const int64_t token_num = key.size(0); + const int64_t key_token_num_stride = key.stride(0); + const int64_t value_token_num_stride = value.stride(0); + const int64_t head_num = value.size(1); + const int64_t key_head_num_stride = key.stride(1); + const int64_t value_head_num_stride = value.stride(1); + const int64_t num_blocks = key_cache.size(0); + const int64_t num_blocks_stride = key_cache.stride(0); + const int64_t cache_head_num_stride = key_cache.stride(1); + const int64_t block_size = key_cache.size(2); + const int64_t block_size_stride = key_cache.stride(2); + const int64_t head_dim = key.size(-1); + + cpu_attention::ISA isa_tag = [&]() { + if (isa == "amx") { + return cpu_attention::ISA::AMX; + } else if (isa == "vec") { + return cpu_attention::ISA::VEC; + } else if (isa == "vec16") { + return cpu_attention::ISA::VEC16; + } else { + TORCH_CHECK(false, "Invalid ISA type: " + isa); + } + }(); + + VLLM_DISPATCH_FLOATING_TYPES( + key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() { + CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] { + CPU_ATTN_DISPATCH_IMPL(isa_tag, [&]() { + attn_impl::reshape_and_cache( + key.data_ptr(), value.data_ptr(), + key_cache.data_ptr(), + value_cache.data_ptr(), + slot_mapping.data_ptr(), token_num, + key_token_num_stride, value_token_num_stride, head_num, + key_head_num_stride, value_head_num_stride, num_blocks, + num_blocks_stride, cache_head_num_stride, block_size, + block_size_stride); + }); + }); + }); +} + +void cpu_attention_with_kv_cache( + const torch::Tensor& query, // [num_tokens, num_heads, head_size] + const torch::Tensor& + key_cache, // [num_blocks, num_kv_heads, block_size, head_size] + const torch::Tensor& + value_cache, // [num_blocks, num_kv_heads, block_size, head_size] + torch::Tensor& output, // [num_tokens, num_heads, head_size] + const torch::Tensor& query_start_loc, // [num_tokens + 1] + const torch::Tensor& seq_lens, // [num_tokens] + const double scale, const bool causal, + const std::optional& alibi_slopes, // [num_heads] + const int64_t sliding_window_left, const int64_t sliding_window_right, + const torch::Tensor& block_table, // [num_tokens, max_block_num] + const double softcap, const torch::Tensor& scheduler_metadata, + const std::optional& s_aux // [num_heads] +) { + TORCH_CHECK_EQ(query.dim(), 3); + TORCH_CHECK_EQ(query.stride(2), 1); + TORCH_CHECK_EQ(key_cache.dim(), 4); + TORCH_CHECK_EQ(value_cache.dim(), 4); + + cpu_attention::AttentionInput input; + input.metadata = reinterpret_cast( + scheduler_metadata.data_ptr()); + input.num_tokens = query.size(0); + input.num_heads = query.size(1); + input.num_kv_heads = key_cache.size(1); + input.block_size = key_cache.size(2); + input.query = query.data_ptr(); + input.query_num_tokens_stride = query.stride(0); + input.query_num_heads_stride = query.stride(1); + input.cache_num_blocks_stride = key_cache.stride(0); + input.cache_num_kv_heads_stride = key_cache.stride(1); + input.blt_num_tokens_stride = block_table.stride(0); + input.key_cache = key_cache.data_ptr(); + input.value_cache = value_cache.data_ptr(); + input.output = output.data_ptr(); + input.query_start_loc = query_start_loc.data_ptr(); + input.seq_lens = seq_lens.data_ptr(); + input.block_table = block_table.data_ptr(); + input.alibi_slopes = + alibi_slopes.has_value() ? alibi_slopes->data_ptr() : nullptr; + // For now sink must be bf16 + input.s_aux = s_aux.has_value() ? s_aux->data_ptr() : nullptr; + input.scale = scale; + input.causal = causal; + input.sliding_window_left = sliding_window_left; + input.sliding_window_right = sliding_window_right; + if (input.causal) { + // to make boundary calculation easier + input.sliding_window_right = 0; + } + float softcap_fp32 = softcap; + input.softcap = softcap_fp32; + + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), "cpu_attention_with_kv_cache", [&]() { + CPU_ATTN_DISPATCH_CASE_HEADDIM(query.size(2), [&] { + CPU_ATTN_DISPATCH_IMPL(input.metadata->isa, [&]() { + TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0); + cpu_attention::AttentionMainLoop mainloop; + mainloop(&input); + }); + }); + }); +} diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8da458b99119c31667ff875eeb947e5979f65968 --- /dev/null +++ b/csrc/cpu/cpu_attn_amx.hpp @@ -0,0 +1,511 @@ +#ifndef CPU_ATTN_AMX_HPP +#define CPU_ATTN_AMX_HPP + +#include "cpu_attn_impl.hpp" + +namespace cpu_attention { +namespace { +// AMX specific +constexpr static int64_t AMX_TILE_ROW_BYTES = 64; +constexpr static int64_t AMX_TILE_ROW_NUM = 16; +constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM; + +typedef struct __tile_config { + uint8_t palette_id = 1; + uint8_t start_row = 0; + uint8_t reserved_0[14] = {0}; + uint16_t colsb[16] = {0}; + uint8_t rows[16] = {0}; +} __tilecfg; + +// 2-2-4 pattern, for 16 < m <= 32 +// TILE 0, 1: load A matrix, row num should be 16, m - 16 +// TILE 2, 3: load B matrix, row num should be 16 +// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m +// - 16 +template +class TileGemm224 { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile, + void* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported kv cache type for TileGemm224"); + } +}; + +template <> +class TileGemm224 { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, + c10::BFloat16* __restrict__ a_tile, + c10::BFloat16* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + const int32_t k_times = + dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + c10::BFloat16* __restrict__ a_tile_0 = a_tile; + c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM; + const int64_t a_tile_stride = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // q_buffer is prepacked + return AMX_TILE_ROW_BYTES; + } else if constexpr (phase == AttentionGemmPhase::PV) { + // logits_buffer is row-major + return lda * sizeof(c10::BFloat16); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + + c10::BFloat16* __restrict__ b_tile_2 = b_tile; + c10::BFloat16* __restrict__ b_tile_3 = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // k_cache is prepacked + return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // v_cache is prepacked + return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + // k_cache, v_cache are prepacked + const int32_t b_tile_stride = AMX_TILE_ROW_BYTES; + + // logits_buffer, output_buffer are not prepacked + float* __restrict__ c_tile_4 = c_tile; + float* __restrict__ c_tile_5 = + c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float); + float* __restrict__ c_tile_6 = c_tile + AMX_TILE_ROW_NUM * ldc; + float* __restrict__ c_tile_7 = + c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float); + const int32_t c_tile_stride = ldc * sizeof(float); + + if (accum_c) { + _tile_loadd(4, c_tile_4, c_tile_stride); + _tile_loadd(5, c_tile_5, c_tile_stride); + _tile_loadd(6, c_tile_6, c_tile_stride); + _tile_loadd(7, c_tile_7, c_tile_stride); + } else { + _tile_zero(4); + _tile_zero(5); + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_times; ++k) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_tile_stride); + _tile_dpbf16ps(4, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_tile_stride); + _tile_dpbf16ps(5, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_dpbf16ps(6, 1, 2); + _tile_dpbf16ps(7, 1, 3); + + // update ptrs + if constexpr (phase == AttentionGemmPhase::QK) { + // Q buffer is prepacked + a_tile_0 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + a_tile_1 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // P buffer is not prepacked + a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + } else { + TORCH_CHECK(false, "Unreachable"); + } + b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + _tile_stored(4, c_tile_4, c_tile_stride); + _tile_stored(5, c_tile_5, c_tile_stride); + _tile_stored(6, c_tile_6, c_tile_stride); + _tile_stored(7, c_tile_7, c_tile_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + const int32_t m_0 = AMX_TILE_ROW_NUM; + const int32_t m_1 = m - AMX_TILE_ROW_NUM; + config.rows[0] = m_0; + config.rows[1] = m_1; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = m_0; + config.rows[5] = m_0; + config.rows[6] = m_1; + config.rows[7] = m_1; + _tile_loadconfig(&config); + } +}; + +// 1-2-2 pattern, for 0 < m <= 16 +// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be +// m, m +// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row +// num should be 16 +// TILE 6, 7, (6, 7): store results C matrix, row num should be +// m +template +class TileGemm122 { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, void* __restrict__ a_tile, + void* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122"); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + TORCH_CHECK(false, "Unsupported kv cache type for TileGemm122"); + } +}; + +template <> +class TileGemm122 { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, + c10::BFloat16* __restrict__ a_tile, + c10::BFloat16* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + c10::BFloat16* __restrict__ a_tile_0 = a_tile; + c10::BFloat16* __restrict__ a_tile_1 = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // q_buffer is prepacked + return a_tile + AMX_TILE_BYTES / sizeof(c10::BFloat16); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // logits_buffer is row-major + return a_tile + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + const int64_t a_tile_stride = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // q_buffer is prepacked + return AMX_TILE_ROW_BYTES; + } else if constexpr (phase == AttentionGemmPhase::PV) { + // logits_buffer is row-major + return lda * sizeof(c10::BFloat16); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + + c10::BFloat16* __restrict__ b_tile_2 = b_tile; + c10::BFloat16* __restrict__ b_tile_3 = [&]() { + if constexpr (phase == AttentionGemmPhase::QK) { + // k_cache is prepacked + return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // v_cache is prepacked + return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4); + } else { + TORCH_CHECK(false, "Unreachable"); + } + }(); + c10::BFloat16* __restrict__ b_tile_4 = + b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + c10::BFloat16* __restrict__ b_tile_5 = + b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16); + int64_t b_stride = AMX_TILE_ROW_BYTES; + + float* __restrict__ c_tile_6 = c_tile; + float* __restrict__ c_tile_7 = c_tile + AMX_TILE_ROW_BYTES / sizeof(float); + int64_t c_stride = ldc * sizeof(float); + + const int32_t k_times = + dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + const int32_t k_group_times = k_times / 2; + const bool has_tail = (k_times % 2 == 1); + + if (accum_c) { + _tile_loadd(6, c_tile_6, c_stride); + _tile_loadd(7, c_tile_7, c_stride); + } else { + _tile_zero(6); + _tile_zero(7); + } + + for (int32_t k = 0; k < k_group_times; ++k) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + _tile_loadd(1, a_tile_1, a_tile_stride); + _tile_stream_loadd(4, b_tile_4, b_stride); + _tile_dpbf16ps(6, 1, 4); + _tile_stream_loadd(5, b_tile_5, b_stride); + _tile_dpbf16ps(7, 1, 5); + + // update ptrs + if constexpr (phase == AttentionGemmPhase::QK) { + // Q buffer is prepacked + a_tile_0 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + a_tile_1 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + } else if constexpr (phase == AttentionGemmPhase::PV) { + // P buffer is not prepacked + a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16); + } + b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_3 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16); + } + + if (has_tail) { + _tile_loadd(0, a_tile_0, a_tile_stride); + _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_dpbf16ps(6, 0, 2); + _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_dpbf16ps(7, 0, 3); + } + + _tile_stored(6, c_tile_6, c_stride); + _tile_stored(7, c_tile_7, c_stride); + } + + FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) { + config.rows[0] = m; + config.rows[1] = m; + config.rows[2] = AMX_TILE_ROW_NUM; + config.rows[3] = AMX_TILE_ROW_NUM; + config.rows[4] = AMX_TILE_ROW_NUM; + config.rows[5] = AMX_TILE_ROW_NUM; + config.rows[6] = m; + config.rows[7] = m; + _tile_loadconfig(&config); + } +}; +} // namespace + +template +class AttentionImpl { + public: + using query_t = scalar_t; + using q_buffer_t = scalar_t; + using kv_cache_t = scalar_t; + using logits_buffer_t = float; + using partial_output_buffer_t = float; + using prob_buffer_t = scalar_t; + + constexpr static int64_t BlockSizeAlignment = + AMX_TILE_ROW_BYTES / + sizeof(kv_cache_t); // KV token num unit of QK and PV phases + constexpr static int64_t HeadDimAlignment = + 2 * (AMX_TILE_ROW_BYTES / 4); // headdim num unit of PV phase + constexpr static int64_t MaxQHeadNumPerIteration = 32; + constexpr static int64_t HeadDim = head_dim; + constexpr static ISA ISAType = ISA::AMX; + constexpr static bool scale_on_logits = true; + + public: + AttentionImpl() : current_q_head_num_(0) { + // Use all columns in AMX tiles + vec_op::unroll_loop([&](int i) { amx_tile_config_.colsb[i] = 64; }); + } + + ~AttentionImpl() { _tile_release(); } + + template