diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index e29881fcbac0175b7cbcd93c82fbecd8d9d59b59..68aff793ae6aa55ac1eedc130446c0b5e7046f2a 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 270663c415c7206b82bde00377da3f45ecc08b70..7045d8810493e5c79d670a11401b99cf16268a2e 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/.buildkite/lm-eval-harness/conftest.py b/.buildkite/lm-eval-harness/conftest.py index 769d2efda4adc494cd9e78074b30b4a721cb279a..c0d60dd5328f454b91dda36b216a09696671b0bc 100644 --- a/.buildkite/lm-eval-harness/conftest.py +++ b/.buildkite/lm-eval-harness/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import pytest diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 409a6ca82008243ace99a3c4dc735305cdb1730c..930adfaf3e192febfa09cd34b8cb8f0e05d2dd7a 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ LM eval harness on model to compare vs HF baseline computed offline. Configs are found in configs/$MODEL.yaml diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index d3f5fc5cd4cee6a40d2d2a3fcb4677259f173b23..72c52d5bb5e9ba8fe9b9602862ed2a9b20aa9ab0 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do ### Visualizing the results -The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. +The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 7f2a2d8dc2969275bd0739da8dd8c976ad728c42..a4f1638c1adb8db7336960f9e227b23b054181a8 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index 778a3a8d87f63f3aba83d21d2a36b8159e7f81b9..8532ff7ef798cc039926ef8781fccab71f5e17a9 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 10a7a2f5a467e7ba9fefc08914a4af033b89b163..053fd52c35ae906710cbe08579becf0eafc72c4b 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py index e5f179a0f5b68b4f684869bc652827f69d6266ef..ddea1d2b1b1ed5710f5f878c663e83f388a62b73 100644 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from lmdeploy.serve.openai.api_client import APIClient diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 2a7b37991f31a0e4a553e3f3b300b4cc37d19da4..fb3b9d5e34e03c7708f560301a7b802f3e3906bc 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime import json diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index b3c27e2c99c2b2c2741871bd4276cb6929139ef1..16b5ad0297fe79ced4bbb25f7a1ced0c58425c22 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,5 +1,6 @@ steps: - label: "Build wheel - CUDA 12.8" + id: build-wheel-cuda-12-8 agents: queue: cpu_queue_postmerge commands: @@ -11,6 +12,7 @@ steps: DOCKER_BUILDKIT: "1" - label: "Build wheel - CUDA 12.6" + id: build-wheel-cuda-12-6 agents: queue: cpu_queue_postmerge commands: @@ -28,6 +30,7 @@ steps: - label: "Build wheel - CUDA 11.8" # depends_on: block-build-cu118-wheel + id: build-wheel-cuda-11-8 agents: queue: cpu_queue_postmerge commands: @@ -44,6 +47,7 @@ steps: - label: "Build release image" depends_on: block-release-image-build + id: build-release-image agents: queue: cpu_queue_postmerge commands: @@ -51,6 +55,18 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + - label: "Annotate release workflow" + depends_on: + - build-release-image + - build-wheel-cuda-12-8 + - build-wheel-cuda-12-6 + - build-wheel-cuda-11-8 + id: annotate-release-workflow + agents: + queue: cpu_queue_postmerge + commands: + - "bash .buildkite/scripts/annotate-release.sh" + - label: "Build and publish TPU release image" depends_on: ~ if: build.env("NIGHTLY") == "1" @@ -70,9 +86,10 @@ steps: DOCKER_BUILDKIT: "1" - input: "Provide Release version here" + id: input-release-version fields: - text: "What is the release version?" - key: "release-version" + key: release-version - block: "Build CPU release image" key: block-cpu-release-image-build diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh new file mode 100755 index 0000000000000000000000000000000000000000..94e0ac2398f34ecf5f5b1c957a0373d6a61c9218 --- /dev/null +++ b/.buildkite/scripts/annotate-release.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +set -ex + +# Get release version and strip leading 'v' if present +RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//') + +if [ -z "$RELEASE_VERSION" ]; then + echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid." + exit 1 +fi + +buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF +To download the wheel: +\`\`\` +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . +\`\`\` + +To download and upload the image: + +\`\`\` +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai +docker tag vllm/vllm-openai vllm/vllm-openai:latest +docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} +docker push vllm/vllm-openai:latest +docker push vllm/vllm-openai:v${RELEASE_VERSION} +\`\`\` +EOF \ No newline at end of file diff --git a/.buildkite/scripts/ci-clean-log.sh b/.buildkite/scripts/ci-clean-log.sh new file mode 100644 index 0000000000000000000000000000000000000000..69d8a3a288316b547a85e39caec5021d15fca563 --- /dev/null +++ b/.buildkite/scripts/ci-clean-log.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Usage: ./ci_clean_log.sh ci.log +# This script strips timestamps and color codes from CI log files. + +# Check if argument is given +if [ $# -lt 1 ]; then + echo "Usage: $0 ci.log" + exit 1 +fi + +INPUT_FILE="$1" + +# Strip timestamps +sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE" + +# Strip colorization +sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index bbc896ec68190b5b05b47be6d0a6c8e1c4d8ef7d..6e9af1e721bb70cdf38914ea32f48156bbb7248f 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} fi +if [[ $commands == *"pytest -v -s lora"* ]]; then + commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} +fi + #ignore certain kernels tests if [[ $commands == *" kernels/core"* ]]; then commands="${commands} \ diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 077bd9914907945d5a99f964eda7377a3ed71294..36bcb015d308ebf096662060c4e9726db10e0927 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -7,6 +7,7 @@ set -ex # Setup cleanup remove_docker_container() { if [[ -n "$container_id" ]]; then + podman stop --all -t0 podman rm -f "$container_id" || true fi podman system prune -f @@ -37,7 +38,7 @@ function cpu_tests() { pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] - pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]" + pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" } # All of CPU tests are expected to be finished less than 40 mins. diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 40f3df96065d184cad52e2e59625add4750c7587..bbcde4009c0eb9e9f30ed14bb4b7cfc7a25ea0a2 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -6,72 +6,70 @@ set -ex # allow to bind to different cores CORE_RANGE=${CORE_RANGE:-48-95} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} +export CMAKE_BUILD_PARALLEL_LEVEL=32 + # Setup cleanup remove_docker_container() { set -e; - docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; - docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container # Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 - export BUILDKITE_BUILD_NUMBER=$3 # offline inference - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" # Run basic model test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -v -s tests/kernels/test_cache.py -m cpu_model - pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model - pytest -v -s tests/models/decoder_only/language -m cpu_model - pytest -v -s tests/models/embedding/language -m cpu_model - pytest -v -s tests/models/encoder_decoder/language -m cpu_model - pytest -v -s tests/models/decoder_only/audio_language -m cpu_model - pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model + pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + pytest -v -s tests/models/language/generation -m cpu_model + pytest -v -s tests/models/language/pooling -m cpu_model + pytest -v -s tests/models/multimodal/generation \ + --ignore=tests/models/multimodal/generation/test_mllama.py \ + --ignore=tests/models/multimodal/generation/test_pixtral.py \ + -m cpu_model" # Run compressed-tensor test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ + VLLM_USE_V1=0 pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" # online serving - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=$1 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ @@ -83,7 +81,7 @@ function cpu_tests() { --tokenizer facebook/opt-125m" # Run multi-lora tests - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/lora/test_qwen2vl.py" @@ -91,4 +89,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER" +timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 2d375d7e9d8711502bfc104737972ee78fc482c3..a2a5c2a02cbb9776eda2a42fa4232861bb82896c 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -2,102 +2,184 @@ set -xu -# Build the docker image. -docker build -f docker/Dockerfile.tpu -t vllm-tpu . -# Set up cleanup. -remove_docker_container() { docker rm -f tpu-test || true; } +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + trap remove_docker_container EXIT + # Remove the container that might not be cleaned up in the previous run. remove_docker_container +# Build the docker image. +docker build -f docker/Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker + # For HF_TOKEN. source /etc/environment -# Run a simple end-to-end example. + docker run --privileged --net host --shm-size=16G -it \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ - vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ - && python3 -m pip install pytest pytest-asyncio tpu-info \ - && python3 -m pip install lm_eval[api]==0.4.4 \ - && export VLLM_XLA_CACHE_PATH= \ - && export VLLM_USE_V1=1 \ - && export VLLM_XLA_CHECK_RECOMPILATION=1 \ - && echo HARDWARE \ - && tpu-info \ - && { \ - echo TEST_0: Running test_perf.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \ - echo TEST_0_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_1: Running test_compilation.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \ - echo TEST_1_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_2: Running test_basic.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \ - echo TEST_2_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ - python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ - echo TEST_3_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_4: Running test_quantization_accuracy.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \ - echo TEST_4_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_5: Running examples/offline_inference/tpu.py; \ - python3 /workspace/vllm/examples/offline_inference/tpu.py; \ - echo TEST_5_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_6: Running test_tpu_model_runner.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \ - echo TEST_6_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_7: Running test_sampler.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \ - echo TEST_7_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_8: Running test_topk_topp_sampler.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \ - echo TEST_8_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_9: Running test_multimodal.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \ - echo TEST_9_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_10: Running test_pallas.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \ - echo TEST_10_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_11: Running test_struct_output_generate.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \ - echo TEST_11_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_12: Running test_moe_pallas.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \ - echo TEST_12_EXIT_CODE: \$?; \ - } & \ - # Disable the TPU LoRA tests until the feature is activated - # & { \ - # echo TEST_13: Running test_moe_pallas.py; \ - # python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \ - # echo TEST_13_EXIT_CODE: \$?; \ - # } & \ - wait \ - && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \ -" + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. + +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." + return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. + overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 0 "test_perf.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" +run_and_track_test 1 "test_compilation.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" +run_and_track_test 2 "test_basic.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" +run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" +run_and_track_test 4 "test_quantization_accuracy.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" +run_and_track_test 5 "examples/offline_inference/tpu.py" \ + "python3 /workspace/vllm/examples/offline_inference/tpu.py" +run_and_track_test 6 "test_tpu_model_runner.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" +run_and_track_test 7 "test_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" +run_and_track_test 8 "test_topk_topp_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" +run_and_track_test 9 "test_multimodal.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" +run_and_track_test 10 "test_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" +run_and_track_test 11 "test_struct_output_generate.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" +run_and_track_test 12 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" +run_and_track_test 13 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 14 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 15 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. + +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." + exit 0 +fi # TODO: This test fails because it uses RANDOM_SEED sampling -# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/.buildkite/scripts/rerun-test.sh b/.buildkite/scripts/rerun-test.sh new file mode 100644 index 0000000000000000000000000000000000000000..d79c0d5f381b149c2cf14c3b14afc94820e00d70 --- /dev/null +++ b/.buildkite/scripts/rerun-test.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Usage: ./rerun_test.sh path/to/test.py::test_name + +# Check if argument is given +if [ $# -lt 1 ]; then + echo "Usage: $0 path/to/test.py::test_name" + echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]" + exit 1 +fi + +TEST=$1 +COUNT=1 + +while pytest -sv "$TEST"; do + COUNT=$((COUNT + 1)) + echo "RUN NUMBER ${COUNT}" +done diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh new file mode 100755 index 0000000000000000000000000000000000000000..209d9c4341cdd83033a92fb878ceb8e6b13d5298 --- /dev/null +++ b/.buildkite/scripts/tpu/cleanup_docker.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -euo pipefail + +docker_root=$(docker info -f '{{.DockerRootDir}}') +if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 +fi +echo "Docker root directory: $docker_root" +# Check disk usage of the filesystem where Docker's root directory is located +disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') +# Define the threshold +threshold=70 +if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." +else + echo "Disk usage is below $threshold%. No cleanup needed." +fi diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env new file mode 100644 index 0000000000000000000000000000000000000000..44175864734746458fd1c0c69bf3c5d907f71848 --- /dev/null +++ b/.buildkite/scripts/tpu/config_v6e_1.env @@ -0,0 +1,14 @@ +# Environment config +TEST_NAME=llama8b +CONTAINER_NAME=vllm-tpu + +# vllm config +MODEL=meta-llama/Llama-3.1-8B-Instruct +MAX_NUM_SEQS=512 +MAX_NUM_BATCHED_TOKENS=512 +TENSOR_PARALLEL_SIZE=1 +MAX_MODEL_LEN=2048 +DOWNLOAD_DIR=/mnt/disks/persist +EXPECTED_THROUGHPUT=8.0 +INPUT_LEN=1800 +OUTPUT_LEN=128 diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh new file mode 100755 index 0000000000000000000000000000000000000000..6705da03e3d761baf4d8849bba5444cc0e9c7c6b --- /dev/null +++ b/.buildkite/scripts/tpu/docker_run_bm.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +if [ ! -f "$1" ]; then + echo "Error: The env file '$1' does not exist." + exit 1 # Exit the script with a non-zero status to indicate an error +fi + +ENV_FILE=$1 + +# For testing on local vm, use `set -a` to export all variables +source /etc/environment +source $ENV_FILE + +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; + docker rm -f $CONTAINER_NAME || true; +} + +trap remove_docker_container EXIT + +# Remove the container that might not be cleaned up in the previous run. +remove_docker_container + +# Build docker image. +# TODO: build the image outside the script and share the image with other +# tpu test if building time is too long. +DOCKER_BUILDKIT=1 docker build \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=0 \ + --tag vllm/vllm-tpu-bm \ + --progress plain -f docker/Dockerfile.tpu . + +LOG_ROOT=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $LOG_ROOT" + +if [ -z "$HF_TOKEN" ]; then + echo "Error: HF_TOKEN is not set or is empty." + exit 1 +fi + +# Make sure mounted disk or dir exists +if [ ! -d "$DOWNLOAD_DIR" ]; then + echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder." + exit 1 +fi + +echo "Run model $MODEL" +echo + +echo "starting docker...$CONTAINER_NAME" +echo +docker run \ + -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ + --env-file $ENV_FILE \ + -e HF_TOKEN="$HF_TOKEN" \ + -e TARGET_COMMIT=$BUILDKITE_COMMIT \ + -e MODEL=$MODEL \ + -e WORKSPACE=/workspace \ + --name $CONTAINER_NAME \ + -d \ + --privileged \ + --network host \ + -v /dev/shm:/dev/shm \ + vllm/vllm-tpu-bm tail -f /dev/null + +echo "run script..." +echo +docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh" + +echo "copy result back..." +VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt +BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt +docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" +docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG" + +throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') +echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput" + +if [ "$BUILDKITE" = "true" ]; then + echo "Running inside Buildkite" + buildkite-agent artifact upload "$VLLM_LOG" + buildkite-agent artifact upload "$BM_LOG" +else + echo "Not running inside Buildkite" +fi + +# +# compare the throughput with EXPECTED_THROUGHPUT +# and assert meeting the expectation +# +if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then + echo "Failed to get the throughput" + exit 1 +fi + +if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then + echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)" + exit 1 +fi diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh new file mode 100755 index 0000000000000000000000000000000000000000..877669cd956ac5f5977d91ac3e0f645e73e5cd05 --- /dev/null +++ b/.buildkite/scripts/tpu/run_bm.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -euo pipefail + +VLLM_LOG="$WORKSPACE/vllm_log.txt" +BM_LOG="$WORKSPACE/bm_log.txt" + +if [ -n "$TARGET_COMMIT" ]; then + head_hash=$(git rev-parse HEAD) + if [ "$TARGET_COMMIT" != "$head_hash" ]; then + echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash" + exit 1 + fi +fi + +echo "model: $MODEL" +echo + +# +# create a log folder +# +mkdir "$WORKSPACE/log" + +# TODO: Move to image building. +pip install pandas +pip install datasets + +# +# create sonnet_4x +# +echo "Create sonnet_4x.txt" +echo "" > benchmarks/sonnet_4x.txt +for _ in {1..4} + do + cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt +done + +# +# start vllm service in backend +# +echo "lanching vllm..." +echo "logging to $VLLM_LOG" +echo + +VLLM_USE_V1=1 vllm serve $MODEL \ + --seed 42 \ + --disable-log-requests \ + --max-num-seqs $MAX_NUM_SEQS \ + --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ + --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ + --no-enable-prefix-caching \ + --download_dir $DOWNLOAD_DIR \ + --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & + + +echo "wait for 20 minutes.." +echo +# sleep 1200 +# wait for 10 minutes... +for i in {1..120}; do + # TODO: detect other type of errors. + if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then + echo "Detected RuntimeError, exiting." + exit 1 + elif grep -Fq "Application startup complete" "$VLLM_LOG"; then + echo "Application started" + break + else + echo "wait for 10 seconds..." + sleep 10 + fi +done + +# +# run test +# +echo "run benchmark test..." +echo "logging to $BM_LOG" +echo +python benchmarks/benchmark_serving.py \ + --backend vllm \ + --model $MODEL \ + --dataset-name sonnet \ + --dataset-path benchmarks/sonnet_4x.txt \ + --sonnet-input-len $INPUT_LEN \ + --sonnet-output-len $OUTPUT_LEN \ + --ignore-eos > "$BM_LOG" + +echo "completed..." +echo + +throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') +echo "throughput: $throughput" +echo diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 80a5a610c8ac99b9814c55a6c0924569f340b34e..b739851cb90528b0f1b7feab14b14a6c4ded0802 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -145,6 +145,7 @@ steps: - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py - tests/v1/test_async_llm_dp.py + - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -154,6 +155,7 @@ steps: # test with internal dp - python3 ../examples/offline_inference/data_parallel.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py @@ -199,8 +201,9 @@ steps: - tests/test_sequence - tests/test_config - tests/test_logger + - tests/test_vllm_port commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py # OOM in the CI unless we run this separately - pytest -v -s tokenization @@ -274,17 +277,6 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: LogitsProcessor Test # 5min - mirror_hardwares: [amdexperimental, amdproduction] - source_file_dependencies: - - vllm/model_executor/layers - - vllm/model_executor/guided_decoding - - tests/test_logits_processor - - tests/model_executor/test_guided_processors - commands: - - pytest -v -s test_logits_processor.py - - pytest -v -s model_executor/test_guided_processors.py - - label: Speculative decoding tests # 40min mirror_hardwares: [amdexperimental] source_file_dependencies: @@ -297,7 +289,7 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/lora - tests/lora @@ -328,6 +320,7 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py + - pytest -v -s compile/piecewise/test_full_cudagraph.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental, amdproduction] @@ -397,6 +390,17 @@ steps: - pytest -v -s tensorizer_loader - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py +- label: Model Executor Test + mirror_hardwares: [amdexperimental, amdproduction] + soft_fail: true + source_file_dependencies: + - vllm/model_executor + - tests/model_executor + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - label: Benchmarks # 9min mirror_hardwares: [amdexperimental, amdproduction] working_dir: "/vllm-workspace/.buildkite" @@ -420,6 +424,9 @@ steps: - vllm/model_executor/layers/quantization - tests/quantization commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release + - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - label: LM Eval Small Models # 53min @@ -617,9 +624,11 @@ steps: - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py - tests/v1/test_async_llm_dp.py + - tests/v1/entrypoints/openai/test_multi_api_servers.py - vllm/v1/engine/ commands: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 4452ce22d504ed1336c81ecaf0d64365962a06ea..e98ccd035ee90997f72cb1ba748b34e6e54fcdac 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,15 +10,17 @@ /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin @russellb +/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/multimodal @DarkLight1337 @ywang96 /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee +/vllm/reasoning @aarnphm +/vllm/entrypoints @aarnphm CMakeLists.txt @tlrmchlsmth # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat -/vllm/v1/structured_output @mgoin @russellb +/vllm/v1/structured_output @mgoin @russellb @aarnphm # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo @@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo -/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb +/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm +/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon /tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/models @DarkLight1337 @ywang96 @@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth /tests/quantization @mgoin @robertgshaw2-redhat /tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb -/tests/v1/structured_output @mgoin @russellb +/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm +/tests/v1/structured_output @mgoin @russellb @aarnphm /tests/weight_loading @mgoin @youkaichao /tests/lora @jeejeelee # Docs /docs @hmellor -mkdocs.yaml @hmellor \ No newline at end of file +mkdocs.yaml @hmellor diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index f05be2ba8707afcffdc89759263137e8d6bc8d72..8c5c28cd77cff594196100f00b22e3c0220dbc09 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -8,6 +8,16 @@ body: attributes: value: > #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: markdown + attributes: + value: | + ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as: + - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys) + - Passwords or authentication credentials + - Private URLs or endpoints + - Personal or confidential data + + Consider redacting or replacing sensitive values with placeholders like `` when sharing configuration or code examples. - type: textarea attributes: label: Your current environment diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 65be771b94fb9a2791d7e61099248e7e532c8a9e..017ec7ca82da78302a551e3d10408bb1d2ceb41e 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,18 @@ -FILL IN THE PR DESCRIPTION HERE +## Essential Elements of an Effective PR Description Checklist +- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". +- [ ] The test plan, such as providing test command. +- [ ] The test results, such as pasting the results comparison before and after, or e2e results +- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. -FIX #xxxx (*link existing issues this PR will resolve*) +PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. + +## Purpose + +## Test Plan + +## Test Result + +## (Optional) Documentation Update **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/mergify.yml b/.github/mergify.yml index e595060c325a5fe128738eb22dbff13fa383baa3..5692bb5d363d8013e09dd26c9e1534dfd9871a5a 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -36,6 +36,20 @@ pull_request_rules: add: - frontend +- name: label-llama + description: Automatically apply llama label + conditions: + - or: + - files~=^examples/.*llama.*\.py + - files~=^tests/.*llama.*\.py + - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py + - files~=^vllm/model_executor/models/.*llama.*\.py + - files~=^vllm/transformers_utils/configs/.*llama.*\.py + actions: + label: + add: + - llama + - name: label-multi-modality description: Automatically apply multi-modality label conditions: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b45619a3234cf490018ef46e5acfaeb0dda6da04..a105b0e14c4aff2330af3139cbcc050cb01b2bcc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,8 @@ repos: hooks: - id: yapf args: [--in-place, --verbose] + # Keep the same list from yapfignore here to avoid yapf failing without any inputs + exclude: '(.buildkite|benchmarks|build|examples)/.*' - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.11.7 hooks: @@ -58,7 +60,7 @@ repos: entry: tools/mypy.sh 0 "local" language: python types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] stages: [pre-commit] # Don't run in CI - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d222f84c9044fedf2c569663e625e5c1bdb9079..dbf0ca291dfb8acb28ea4d9542e9cc10dce191de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) # Suppress potential warnings about unused manually-specified variables set(ignoreMe "${VLLM_PYTHON_PATH}") +# Prevent installation of dependencies (cutlass) by default. +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + # # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. @@ -179,9 +182,6 @@ include(FetchContent) file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") -# -# Set rocm version dev int. -# if(VLLM_GPU_LANG STREQUAL "HIP") # # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info @@ -189,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP") set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3") - # # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates # a lot of warnings that always mask real issues. Suppressing until this is properly addressed. @@ -244,6 +243,7 @@ set(VLLM_EXT_SRC "csrc/layernorm_kernels.cu" "csrc/opt/transpose_kernels.cu" # "csrc/layernorm_quant_kernels.cu" + "csrc/sampler.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" @@ -309,7 +309,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_ARCHS) # @@ -455,7 +455,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # kernels for the remaining archs that are not already built for 3x. # (Build 8.9 for FP8) cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}") + "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -544,8 +544,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # CUTLASS MoE kernels # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works - # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible - # to compile MoE kernels that use its output. + # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled + # if it's possible to compile MoE kernels that use its output. cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu" @@ -685,7 +685,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) # @@ -788,5 +788,7 @@ endif() # For CUDA we also build and ship some external projects. if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) + + # vllm-flash-attn should be last as it overwrites some CMake functions include(cmake/external_projects/vllm_flash_attn.cmake) endif () \ No newline at end of file diff --git a/README.md b/README.md index 67f6b957ec55a4744d5b1447345c3f2edcbea590..ec16d758327d4ecde377c9e96f9fdc49fb4da705 100644 --- a/README.md +++ b/README.md @@ -58,8 +58,8 @@ vLLM is fast with: - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph -- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8. -- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer - Speculative decoding - Chunked prefill @@ -72,14 +72,14 @@ vLLM is flexible and easy to use with: - Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron. +- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron - Prefix caching support - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) -- Embedding Models (e.g. E5-Mistral) +- Embedding Models (e.g., E5-Mistral) - Multi-modal LLMs (e.g., LLaVA) Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). @@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Media Kit -- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) diff --git a/SECURITY.md b/SECURITY.md index 47196a1f1221e216d90e23353f73371cf0b52773..6053cfb41f35b2a9511c2a64f68f7f0c15ee9c6f 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form --- +Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. + Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. diff --git a/benchmarks/README.md b/benchmarks/README.md index ecab570bb31c4107dc1d022963abde2a71e15299..6f9fbb91cbd9110a36c7a6708c2a8d6cf3a50a35 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -64,6 +64,12 @@ become available. ✅ lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered + + Custom + ✅ + ✅ + Local file: data.jsonl + @@ -124,6 +130,38 @@ P99 ITL (ms): 8.39 ================================================== ``` +### Custom Dataset +If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl + +``` +{"prompt": "What is the capital of India?"} +{"prompt": "What is the capital of Iran?"} +{"prompt": "What is the capital of China?"} +``` + +```bash +# start server +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +``` + +```bash +# run benchmarking script +python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ + --backend vllm \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --endpoint /v1/completions \ + --dataset-name custom \ + --dataset-path \ + --custom-skip-chat-template \ + --num-prompts 80 \ + --max-concurrency 1 \ + --temperature=0.3 \ + --top-p=0.75 \ + --result-dir "./log/" +``` + +You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. + ### VisionArena Benchmark for Vision Language Models ```bash @@ -146,9 +184,9 @@ python3 vllm/benchmarks/benchmark_serving.py \ ``` bash VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --ngram_prompt_lookup_min 2 \ - --ngram-prompt-lookup-max 5 \ - --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5} + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' ``` ``` bash @@ -203,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \ --seed 42 ``` +**`philschmid/mt-bench`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path philschmid/mt-bench \ + --num-prompts 80 +``` + ### Running With Sampling Parameters When using OpenAI-compatible backends such as `vllm`, optional sampling @@ -273,9 +321,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \ --output-len=100 \ --num-prompts=2048 \ --async-engine \ - --ngram_prompt_lookup_min=2 \ - --ngram-prompt-lookup-max=5 \ - --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5} + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' ``` ``` diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune.sh index ea63c6f71a6c50ae698b0c9969d38da91896f728..1b01bbd61b628f0ed18041b8945d2e5f086951a4 100644 --- a/benchmarks/auto_tune.sh +++ b/benchmarks/auto_tune.sh @@ -10,11 +10,15 @@ # 3. Set variables (ALL REQUIRED) # BASE: your directory for vllm repo # MODEL: the model served by vllm +# TP: ways of tensor parallelism # DOWNLOAD_DIR: directory to download and load model weights. # INPUT_LEN: request input len # OUTPUT_LEN: request output len # MIN_CACHE_HIT_PCT: prefix cache rate # MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 +# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with. +# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with. +# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST. # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. # 5. The final result will be saved in RESULT file. @@ -30,31 +34,27 @@ TAG=$(date +"%Y_%m_%d_%H_%M") BASE="" MODEL="meta-llama/Llama-3.1-8B-Instruct" +TP=1 DOWNLOAD_DIR="" INPUT_LEN=4000 OUTPUT_LEN=16 -MIN_CACHE_HIT_PCT_PCT=0 +MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 +NUM_SEQS_LIST="128 256" +NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" LOG_FOLDER="$BASE/auto-benchmark/$TAG" RESULT="$LOG_FOLDER/result.txt" -echo "result file$ $RESULT" +echo "result file: $RESULT" echo "model: $MODEL" -echo rm -rf $LOG_FOLDER mkdir -p $LOG_FOLDER cd "$BASE/vllm" -# create sonnet-4x.txt so that we can sample 2048 tokens for input -echo "" > benchmarks/sonnet_4x.txt -for _ in {1..4} -do -cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt -done -pip install datasets +pip install -q datasets current_hash=$(git rev-parse HEAD) echo "hash:$current_hash" >> "$RESULT" @@ -64,53 +64,69 @@ best_throughput=0 best_max_num_seqs=0 best_num_batched_tokens=0 best_goodput=0 -run_benchmark() { - local max_num_seqs=$1 - local max_num_batched_tokens=$2 - echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" - local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" - echo "vllm_log: $vllm_log" - echo - rm -f $vllm_log - # start the server +start_server() { + local gpu_memory_utilization=$1 + local max_num_seqs=$2 + local max_num_batched_tokens=$3 + local vllm_log=$4 + + pkill -f vllm + VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \ --disable-log-requests \ --port 8004 \ - --gpu-memory-utilization 0.98 \ + --gpu-memory-utilization $gpu_memory_utilization \ --max-num-seqs $max_num_seqs \ --max-num-batched-tokens $max_num_batched_tokens \ - --tensor-parallel-size 1 \ + --tensor-parallel-size $TP \ --enable-prefix-caching \ --load-format dummy \ - --download-dir $DOWNLOAD_DIR \ + --download-dir "$DOWNLOAD_DIR" \ --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & - echo "wait for 10 minutes.." - echo + # wait for 10 minutes... server_started=0 - for i in {1..60}; do - if grep -Fq "Application startup complete" "$vllm_log"; then - echo "Application started" + for i in {1..60}; do + RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) + STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) + if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 break else - # echo "wait for 10 seconds..." sleep 10 fi done - if (( ! server_started )); then - echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log" - echo "pkill -f vllm" - echo - pkill vllm - sleep 10 + echo "server did not start within 10 minutes. Please check server log at $vllm_log". return 1 + else + return 0 fi +} + +run_benchmark() { + local max_num_seqs=$1 + local max_num_batched_tokens=$2 + local gpu_memory_utilization=$3 + echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt" + echo "vllm_log: $vllm_log" + echo + rm -f $vllm_log + pkill -f vllm + + echo "starting server..." + start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log + result=$? + if [[ "$result" -eq 1 ]]; then + echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens" + else + echo "server started." + fi + echo echo "run benchmark test..." - echo meet_latency_requirement=0 # get a basic qps by using request-rate inf bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" @@ -118,29 +134,29 @@ run_benchmark() { python benchmarks/benchmark_serving.py \ --backend vllm \ --model $MODEL \ - --dataset-name sonnet \ - --dataset-path benchmarks/sonnet_4x.txt \ - --sonnet-input-len $INPUT_LEN \ - --sonnet-output-len $OUTPUT_LEN \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ --ignore-eos \ --disable-tqdm \ --request-rate inf \ --percentile-metrics ttft,tpot,itl,e2el \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ - --num-prompts 100 \ - --sonnet-prefix-len $prefix_len \ - --port 8004 > "$bm_log" - through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + --num-prompts 1000 \ + --random-prefix-len $prefix_len \ + --port 8004 &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then meet_latency_requirement=1 + request_rate=inf fi if (( ! meet_latency_requirement )); then - # start from request-rate as int(through_put) + 1 - request_rate=$((${through_put%.*} + 1)) + # start from request-rate as int(throughput) + 1 + request_rate=$((${throughput%.*} + 1)) while ((request_rate > 0)); do # clear prefix cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache @@ -149,19 +165,18 @@ run_benchmark() { python benchmarks/benchmark_serving.py \ --backend vllm \ --model $MODEL \ - --dataset-name sonnet \ - --dataset-path benchmarks/sonnet_4x.txt \ - --sonnet-input-len $INPUT_LEN \ - --sonnet-output-len $OUTPUT_LEN \ - --ignore_eos \ + --dataset-name random \ + --random-input-len $INPUT_LEN \ + --random-output-len $OUTPUT_LEN \ + --ignore-eos \ --disable-tqdm \ --request-rate $request_rate \ --percentile-metrics ttft,tpot,itl,e2el \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 100 \ - --sonnet-prefix-len $prefix_len \ - --port 8004 > "$bm_log" - through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') + --random-prefix-len $prefix_len \ + --port 8004 &> "$bm_log" + throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then @@ -173,10 +188,10 @@ run_benchmark() { fi # write the results and update the best result. if ((meet_latency_requirement)); then - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" - echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT" - if (( $(echo "$through_put > $best_throughput" | bc -l) )); then - best_throughput=$through_put + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" + echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT" + if (( $(echo "$throughput > $best_throughput" | bc -l) )); then + best_throughput=$throughput best_max_num_seqs=$max_num_seqs best_num_batched_tokens=$max_num_batched_tokens best_goodput=$goodput @@ -188,22 +203,39 @@ run_benchmark() { echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" - echo "pkill -f vllm" - echo pkill vllm sleep 10 - rm -f $vllm_log printf '=%.0s' $(seq 1 20) return 0 } +read -r -a num_seqs_list <<< "$NUM_SEQS_LIST" +read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST" + +# first find out the max gpu-memory-utilization without HBM OOM. +gpu_memory_utilization=0.98 +find_gpu_memory_utilization=0 +while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do + start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" + result=$? + if [[ "$result" -eq 0 ]]; then + find_gpu_memory_utilization=1 + break + else + gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc) + fi +done + +if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then + echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model." +else + echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER." + exit 1 +fi -num_seqs_list="128 256" -num_batched_tokens_list="512 1024 2048 4096" -for num_seqs in $num_seqs_list; do - for num_batched_tokens in $num_batched_tokens_list; do - run_benchmark $num_seqs $num_batched_tokens - exit 0 +for num_seqs in "${num_seqs_list[@]}"; do + for num_batched_tokens in "${num_batched_tokens_list[@]}"; do + run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization done done echo "finish permutations" diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 88616e1108c52c51cf03f9ee211c347beb40767a..ddb38e304cd6565f5b5369c9555633fec7dfa373 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io import json @@ -324,7 +325,7 @@ async def async_request_openai_completions( most_recent_timestamp = timestamp generated_text += text or "" - elif usage := data.get("usage"): + if usage := data.get("usage"): output.output_tokens = usage.get("completion_tokens") if first_chunk_received: output.success = True @@ -611,6 +612,7 @@ ASYNC_REQUEST_FUNCS = { "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, + "llama.cpp": async_request_openai_completions, } OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 5513a5f78f1ce219be5462d08cb03d7a6e2bebde..5d2a26cd443c0060eb5ed80ed44565812caa96c6 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This module defines a framework for sampling benchmark requests from various datasets. Each dataset subclass of BenchmarkDataset must implement sample @@ -9,9 +10,6 @@ generation. Supported dataset types include: - BurstGPT - HuggingFace - VisionArena - -TODO: Implement CustomDataset to parse a JSON file and convert its contents into -SampleRequest instances, similar to the approach used in ShareGPT. """ import base64 @@ -442,6 +440,97 @@ class ShareGPTDataset(BenchmarkDataset): return samples +# ----------------------------------------------------------------------------- +# Custom Dataset Implementation +# ----------------------------------------------------------------------------- + + +class CustomDataset(BenchmarkDataset): + """ + Implements the Custom dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + {"prompt": "What is the capital of India?"} + {"prompt": "What is the capital of Iran?"} + {"prompt": "What is the capital of China?"} + ``` + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + # self.data will be a list of dictionaries + # e.g., [{"prompt": "What is the capital of India?"}, ...] + # This will be the standardized format which load_data() + # has to convert into depending on the filetype of dataset_path. + # sample() will assume this standardized format of self.data + self.data = [] + + # Load the JSONL file + if self.dataset_path.endswith(".jsonl"): + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) + + # check if the JSONL file has a 'prompt' column + if "prompt" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'prompt' column.") + + # Convert each row to a dictionary and append to self.data + # This will convert the DataFrame to a list of dictionaries + # where each dictionary corresponds to a row in the DataFrame. + # This is the standardized format we want for self.data + for _, row in jsonl_data.iterrows(): + self.data.append(row.to_dict()) + else: + raise NotImplementedError( + "Only JSONL format is supported for CustomDataset." + ) + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + **kwargs, + ) -> list: + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + + return sampled_requests + + # ----------------------------------------------------------------------------- # Sonnet Dataset Implementation # ----------------------------------------------------------------------------- @@ -776,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = f"{item['instruction']}:\n{item['input']}" + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." + + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( SampleRequest( diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 84759c5c354dc7754b09cf56286357af98dc6713..c06857247eeed9bbb61c437ff144d3bd9edba4b1 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark the latency of processing a single batch of requests.""" import argparse @@ -6,13 +7,12 @@ import dataclasses import json import os import time -from pathlib import Path from typing import Any, Optional import numpy as np -import torch from tqdm import tqdm +import vllm.envs as envs from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -80,17 +80,9 @@ def main(args: argparse.Namespace): def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir) - ), - ) as p: - llm_generate() - print(p.key_averages().table(sort_by="self_cuda_time_total")) + llm.start_profile() + llm_generate() + llm.stop_profile() else: start_time = time.perf_counter() llm_generate() @@ -103,11 +95,7 @@ def main(args: argparse.Namespace): run_to_completion(profile_dir=None) if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = ( - Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}" - ) + profile_dir = envs.VLLM_TORCH_PROFILER_DIR print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -164,15 +152,6 @@ if __name__ == "__main__": action="store_true", help="profile the generation process of a single batch", ) - parser.add_argument( - "--profile-result-dir", - type=str, - default=None, - help=( - "path to save the pytorch profiler output. Can be visualized " - "with ui.perfetto.dev or Tensorboard." - ), - ) parser.add_argument( "--output-json", type=str, @@ -193,4 +172,9 @@ if __name__ == "__main__": # numbers. We need to disable prefix caching by default. parser.set_defaults(enable_prefix_caching=False) args = parser.parse_args() + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler." + ) main(args) diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 109624c877891c87d042abdbc5785e61b97a0924..00869fa94e71a78c8ec26f8d35d1ff67871584ee 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Offline benchmark to test the long document QA throughput. diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index ffaa8035797c10accebde3ab6e35356bd6db6292..3e4704f0b8205870914a0c1d7ffe100ece91e6ee 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the efficiency of prefix caching. diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index a05dd24dece83d5c7a12130f56d4a82d5eb377f4..5496703f23ccbe368f4b8dbc20da5ad308109996 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline prioritization.""" import argparse diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index a887e7150dc78ad1f5ca03951b5482d170a36f54..81428fb7dae12a9d4a1f6a0755f9bff25e28a585 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput. On the server side, run one of the following commands: @@ -60,6 +61,7 @@ from benchmark_dataset import ( ASRDataset, BurstGPTDataset, ConversationDataset, + CustomDataset, HuggingFaceDataset, InstructCoderDataset, MTBenchDataset, @@ -627,7 +629,16 @@ def main(args: argparse.Namespace): "'--dataset-path' if required." ) - if args.dataset_name == "sonnet": + if args.dataset_name == "custom": + dataset = CustomDataset(dataset_path=args.dataset_path) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + skip_chat_template=args.custom_skip_chat_template, + ) + + elif args.dataset_name == "sonnet": dataset = SonnetDataset(dataset_path=args.dataset_path) # For the "sonnet" dataset, formatting depends on the backend. if args.backend == "openai-chat": @@ -762,6 +773,10 @@ def main(args: argparse.Namespace): if "temperature" not in sampling_params: sampling_params["temperature"] = 0.0 # Default to greedy decoding. + if args.backend == "llama.cpp": + # Disable prompt caching in llama.cpp backend + sampling_params["cache_prompt"] = False + # Avoid GC processing "static" data - reduce pause times. gc.collect() gc.freeze() @@ -834,6 +849,8 @@ def main(args: argparse.Namespace): ]: if field in result_json: del result_json[field] + if field in benchmark_result: + del benchmark_result[field] # Save to file base_model_id = model_id.split("/")[-1] @@ -846,6 +863,7 @@ def main(args: argparse.Namespace): if args.result_filename: file_name = args.result_filename if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) file_name = os.path.join(args.result_dir, file_name) with open( file_name, mode="a+" if args.append_result else "w", encoding="utf-8" @@ -886,7 +904,7 @@ if __name__ == "__main__": "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], help="Name of the dataset to benchmark on.", ) parser.add_argument( @@ -1056,6 +1074,19 @@ if __name__ == "__main__": ) # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help="Number of output tokens per request, used only for custom dataset.", + ) + custom_group.add_argument( + "--custom-skip-chat-template", + action="store_true", + help="Skip applying chat template to prompt, used only for custom dataset.", + ) + sonnet_group = parser.add_argument_group("sonnet dataset options") sonnet_group.add_argument( "--sonnet-input-len", diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 6a50f47d3951cf7f9e33b69b152efe279aa9c6c7..c1501ad52c25af1f101785e4a2143e8a5e317489 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput with structured outputs. On the server side, run one of the following commands: @@ -11,7 +12,6 @@ On the client side, run: --model \ --dataset json \ --structured-output-ratio 1.0 \ - --structured-output-backend auto \ --request-rate 10 \ --num-prompts 1000 diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 7a13babda9d16227385e5bae3df37d5daed12b05..d19753d40e497d95c56ed1330a802d788b0d5ded 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline inference throughput.""" import argparse diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index b0c4fca92c3d0035904691d5a1fc9b99e741b7a0..283f938df50af101fa2ff82f3c46e123b25e5524 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json @@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder): def write_to_json(filename: str, records: list) -> None: with open(filename, "w") as f: - json.dump(records, f, cls=InfEncoder) + json.dump( + records, + f, + cls=InfEncoder, + default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", + ) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index da258f98e085f973110dc623111012f5f6b61b93..9ec270bbd2e988b459f99d876215984bf125d0f9 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index 7e9f5a7fc0f464718e15e1cc024df89081da74f9..b4f3c6bf94eda0e1bf1d253def3b17d273415dcc 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Cutlass bench utils from collections.abc import Iterable diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 08e93837f7ddff3da18c37c482e527695dff75b2..cec422e8d597f1df353eb5a4836c8f88f1685c2a 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index d31b623a1ee604c2bf0b4ab7cb90d37ffa463adb..25b96ef56620ea7dbd97846cdd57ab0e97a6dfd1 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index fce156e1c96c62bf0938facd623e40f6fc2a22a3..f62d8102e2d9f29c62f177d62453be5d8496cc1e 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index fd19b40bf252c3076f317b5482e439de2c01ebbb..b1df2f255822dad046f5dfcdc1d6538006463510 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import itertools diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 484d0cb3cba7d74db8dea04e68e17dde38be25e6..74fa56d076cf14bc066468be24f6053b45166001 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 37a9173a1a937808271cc832ee49fe4172474601..901524214469e8677cbb58c3078e8b1d724585bb 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle as pkl import time diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py new file mode 100644 index 0000000000000000000000000000000000000000..b964ed242edf8f16a8fd63ab27c098d11c9663f4 --- /dev/null +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -0,0 +1,223 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant +from vllm.triton_utils import triton + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], + x_log=False, + line_arg="provider", + line_vals=[ + "torch-bf16", + # "fp8-tensor-w-token-a", + "fp8-tensor-w-tensor-a", + "fp8-channel-w-token-a", + # "fp8-channel-w-tensor-a", + # "fp8-tensor-w-token-a-noquant", + "fp8-tensor-w-tensor-a-noquant", + "fp8-channel-w-token-a-noquant", + # "fp8-channel-w-tensor-a-noquant", + ], + line_names=[ + "torch-bf16", + # "fp8-tensor-w-token-a", + "fp8-tensor-w-tensor-a", + "fp8-channel-w-token-a", + # "fp8-channel-w-tensor-a", + # "fp8-tensor-w-token-a-noquant", + "fp8-tensor-w-tensor-a-noquant", + "fp8-channel-w-token-a-noquant", + # "fp8-channel-w-tensor-a-noquant", + ], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs FP8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensors + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if "torch-bf16" in provider: + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + + elif "fp8" in provider: + # Weights are always quantized ahead of time + if "noquant" in provider: + # For no quantization, we just measure the GEMM + if "tensor-w-token-a" in provider: + # Dynamic per-token quant for A, per-tensor quant for B + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) + assert scale_b_fp8.numel() == 1 + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "tensor-w-tensor-a" in provider: + # Static per-tensor quantization with fixed scales + # for both A and B + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + assert scale_b_fp8.numel() == 1 + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-token-a" in provider: + # Static per-channel quantization for weights, per-token + # quant for A + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-tensor-a" in provider: + # Static per-channel quantization for weights, per-tensor + # quant for A + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + else: + # In these cases, we quantize the activations during the GEMM call + if "tensor-w-token-a" in provider: + # Dynamic per-token quant for A, per-tensor quant for B + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) + assert scale_b_fp8.numel() == 1 + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "tensor-w-tensor-a" in provider: + # Static per-tensor quantization with fixed scales + # for both A and B + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + assert scale_b_fp8.numel() == 1 + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-token-a" in provider: + # Static per-channel quantization for weights, per-token + # quant for A + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-tensor-a" in provider: + # Static per-channel quantization for weights, per-tensor + # quant for A + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + b_fp8 = b_fp8.t() + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + # Calculate TFLOP/s, two flops per multiply-add + tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3) + return tflops(ms), tflops(max_ms), tflops(min_ms) + + +def prepare_shapes(args): + KN_model_names = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + assert model in WEIGHT_SHAPES + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KN.append(model) + KN_model_names.append(KN) + return KN_model_names + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=[*WEIGHT_SHAPES.keys()], + help="List of models to benchmark", + ) + parser.add_argument( + "--tp-sizes", + nargs="+", + type=int, + default=[1], + help="List of tensor parallel sizes", + ) + args = parser.parse_args() + + KN_model_names = prepare_shapes(args) + for K, N, model_name in KN_model_names: + print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_fp8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index e9934aa479dd6bb21d0e6e45d5e15999c0d23dda..42de062b08e424619b010f9170725b6785a35f58 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index d40ab70ec539b27af09739167a7c11900e6e936d..97ee060341373022b06c6ce62600d5c23c771c37 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index d39d8a6e3aba31612817e62f50ed8bb911fb66e5..35c20ee41b9a94c10f1367a45fe391cc15e9ffec 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit @@ -90,7 +91,7 @@ def bench_run( score = torch.randn((m, num_experts), device=device, dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False) quant_blocksize = 16 w1_blockscale = torch.empty( diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 2197bceabe6c034eb591fae5308e7d483016c317..acabe6c1ddb0a18aac64cef4326fb14220fe4858 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.utils.benchmark as benchmark @@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.fused_moe import ( - cutlass_moe_fp8, fused_experts, fused_topk, ) @@ -69,18 +70,9 @@ def bench_run( w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32) - ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64) - ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64) - c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64) - for expert in range(num_experts): w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert]) w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert]) - w1_q_notransp = w1_q.clone() - w2_q_notransp = w2_q.clone() - w1_q = w1_q.transpose(1, 2) - w2_q = w2_q.transpose(1, 2) score = torch.randn((m, num_experts), device="cuda", dtype=dtype) @@ -121,10 +113,6 @@ def bench_run( w2_scale: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, - c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides2: torch.Tensor, num_repeats: int, ): for _ in range(num_repeats): @@ -132,14 +120,10 @@ def bench_run( a, w1, w2, - w1_scale, - w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, + w1_scale, + w2_scale, a1_scale=a_scale, ) @@ -152,10 +136,6 @@ def bench_run( w2_scale: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - ab_strides1: torch.Tensor, - c_strides1: torch.Tensor, - ab_strides2: torch.Tensor, - c_strides2: torch.Tensor, ): with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) @@ -164,14 +144,10 @@ def bench_run( a, w1_q, w2_q, - w1_scale, - w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, + w1_scale, + w2_scale, a1_scale=a_scale, ) @@ -217,10 +193,6 @@ def bench_run( w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, ) torch.cuda.synchronize() @@ -229,8 +201,8 @@ def bench_run( with torch.cuda.graph(triton_graph, stream=triton_stream): run_triton_from_graph( a, - w1_q_notransp, - w2_q_notransp, + w1_q, + w2_q, topk_weights, topk_ids, w1_scale, @@ -249,18 +221,12 @@ def bench_run( "w2": w2, "score": score, "topk": topk, - "w1_q_notransp": w1_q_notransp, - "w2_q_notransp": w2_q_notransp, # Cutlass params "a_scale": a_scale, "w1_q": w1_q, "w2_q": w2_q, "w1_scale": w1_scale, "w2_scale": w2_scale, - "ab_strides1": ab_strides1, - "c_strides1": c_strides1, - "ab_strides2": ab_strides2, - "c_strides2": c_strides2, # cuda graph params "cutlass_graph": cutlass_graph, "triton_graph": triton_graph, @@ -278,8 +244,8 @@ def bench_run( # Warmup run_triton_moe( a, - w1_q_notransp, - w2_q_notransp, + w1_q, + w2_q, topk_weights, topk_ids, w1_scale, @@ -290,7 +256,7 @@ def bench_run( results.append( benchmark.Timer( - stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 + stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, @@ -321,16 +287,12 @@ def bench_run( w2_scale, topk_weights, topk_ids, - ab_strides1, - c_strides1, - ab_strides2, - c_strides2, num_warmup, ) results.append( benchmark.Timer( - stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501 + stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index f21ca97eeb8a9b2abbf04f979f90ece9003f1cef..69978ec6b23e94ce21d8f3dfb633317b3803e6ef 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 6c1284930c1ec3f6963dd9ec325e72179d14da5f..3d38d4b3534e8993abc8aff85712230e7262a740 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index f8f1db04790bfb714751ef29da3391440a01e378..0f896f187ecb9783b18036779d9ebf56a1783a30 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b17baff2e5f5d36042e4737454b75c4dd5868cd6..9ea1fddae2a3b0b7ff595a70c0c61f9355f0ff41 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.utils.benchmark as benchmark diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2f7660858f574791d1477d69f936b354eba6f20..cef53b183cef3bd9afadc2f45bbf9624c58e6ba0 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json @@ -6,7 +7,6 @@ import time from contextlib import nullcontext from datetime import datetime from itertools import product -from types import SimpleNamespace from typing import Any, TypedDict import ray @@ -42,7 +42,7 @@ def benchmark_config( use_fp8_w8a8: bool, use_int8_w8a16: bool, num_iters: int = 100, - block_quant_shape: List[int] = None, + block_quant_shape: list[int] = None, use_deep_gemm: bool = False, ) -> float: init_dtype = torch.float16 if use_fp8_w8a8 else dtype @@ -399,7 +399,7 @@ class BenchmarkWorker: dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - block_quant_shape: List[int] = None, + block_quant_shape: list[int] = None, use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: current_platform.seed_everything(self.seed) @@ -531,7 +531,7 @@ def save_configs( dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - block_quant_shape: List[int], + block_quant_shape: list[int], ) -> None: dtype_str = get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 @@ -562,7 +562,6 @@ def main(args: argparse.Namespace): config = get_config(model=args.model, trust_remote_code=args.trust_remote_code) if args.model_prefix: config = getattr(config, args.model_prefix) - config = SimpleNamespace(**config) if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts @@ -594,11 +593,7 @@ def main(args: argparse.Namespace): shard_intermediate_size = 2 * intermediate_size // args.tp_size hidden_size = config.hidden_size - dtype = ( - torch.float16 - if current_platform.is_rocm() - else getattr(torch, config.torch_dtype) - ) + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" block_quant_shape = get_weight_block_size_safety(config) diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 333986fdf5eff52574194bbea7c51a8000c37424..dba1f3943b96c370acf1262273695dbdb4097711 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from typing import Any, TypedDict diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 54f05e72322654ff432dfae2c5d8bb52e4ce6ef1..7e0376c18ecc79690ca49e3995258c5e749bbac7 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import time diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 2463dfebe83cce8511a68775e4a4666a6607f892..6ab26f5f1adf73ccdc68c0d5f20d622d0c09b7ba 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index d720083b615037d9f0236086cf42d392a3494b00..4cf633a81358d80202fa383d7312dac4a818ea62 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from typing import Optional, Union diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 110d36db157fdf70afa3a2dde76e8c31cfab6922..b81baf17a8c674edd72fe9a6397a0b027a8d33b9 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate from typing import Optional @@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora( seed: int, device: str, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 70190ba24d9dffbbe05a4e2107a89d020de557b0..18c459c31d3f84d1ca7b7e31d00d50b521177e96 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project WEIGHT_SHAPES = { "ideal": [[4 * 256 * 32, 256 * 32]], diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 6315c1ee6cdd6893e9d5e513c7dadc79f228f7d3..4fcdbadd65ecd366f51ce5d9053cc700854dcb71 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from sglang quantization/tuning_block_wise_kernel.py import argparse diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index e377648254512dab59b6b97b678a52872cdc2b22..e67ce054531818d6d0b59a21c0bd192c4290763a 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # fmt: off # ruff: noqa: E501 import time diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 0c86e40729579994ed54432dbfbbedf15b3ba2d6..9a4da0ef5a85d0df2178d5e990ad177e3f76bf37 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import pickle diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 877a29feed9dfe226a28baaac2d4d5a72ef23943..4bbb36bb4359259944310adb8e1047a02f126dea 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from collections.abc import Iterable diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 89b05d5882a381ce789230a64672811382ebdb8b..a27f02394afbdfa4f3a352a599db349a2601bea5 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) @@ -48,4 +49,50 @@ WEIGHT_SHAPES = { ([16384, 106496], 1), ([53248, 16384], 0), ], + "meta-llama/Llama-3.1-8B-Instruct": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-3.3-70B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "mistralai/Mistral-Large-Instruct-2407": [ + ([12288, 14336], 1), + ([12288, 12288], 0), + ([12288, 57344], 1), + ([28672, 12288], 0), + ], + "Qwen/Qwen2.5-7B-Instruct": [ + ([3584, 4608], 1), + ([3584, 3584], 0), + ([3584, 37888], 1), + ([18944, 3584], 0), + ], + "Qwen/Qwen2.5-32B-Instruct": [ + ([5120, 7168], 1), + ([5120, 5120], 0), + ([5120, 55296], 1), + ([27648, 5120], 0), + ], + "Qwen/Qwen2.5-72B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 59136], 1), + ([29568, 8192], 0), + ], + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [ + ([2048, 3072], 1), + ([2048, 4096], 1), + ([2048, 2048], 0), + ([2048, 576], 0), + ([2048, 21888], 1), + ([10944, 2048], 0), + ([2048, 2816], 1), + ([1408, 2048], 0), + ], } diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index d5701a8fbd6d85f75c4830ca379590306fe59eae..0957a9c65f06c912ae73588e017602af72ae9772 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import cProfile import pstats diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index fb763db9fc359ef9dbe96b688fe914e36245011d..5cd2c98f234381818063d7947a05ca28cb544129 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") else() find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) + find_isa(${CPUINFO} "Power11" POWER11_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support @@ -106,13 +107,19 @@ elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") -elseif (POWER9_FOUND OR POWER10_FOUND) +elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) message(STATUS "PowerPC detected") - # Check for PowerPC VSX support - list(APPEND CXX_COMPILE_FLAGS - "-mvsx" - "-mcpu=native" - "-mtune=native") + if (POWER9_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=power9" + "-mtune=power9") + elseif (POWER10_FOUND OR POWER11_FOUND) + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=power10" + "-mtune=power10") + endif() elseif (ASIMD_FOUND) message(STATUS "ARMv8 or later architecture detected") diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index b04e4c2d06edc90b443d91e34491a0f67431bf67..a4edd5b96fe29ccfece190016aad0cb6c7d283d6 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -46,22 +46,38 @@ else() endif() +# Ensure the vllm/vllm_flash_attn directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS) + +# Make sure vllm-flash-attn install rules are nested under vllm/ +# This is here to support installing all components under the same prefix with cmake --install. +# setup.py installs every component separately but uses the same prefix for all. +# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3, +# and these statements don't hurt when installing neither component. +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS) +install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS) + # Fetch the vllm-flash-attn library FetchContent_MakeAvailable(vllm-flash-attn) message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") +# Restore the install prefix +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in # case only one is built, in the case both are built redundant work is done) install( DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn + DESTINATION vllm/vllm_flash_attn COMPONENT _vllm_fa2_C FILES_MATCHING PATTERN "*.py" ) install( DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn + DESTINATION vllm/vllm_flash_attn COMPONENT _vllm_fa3_C FILES_MATCHING PATTERN "*.py" ) diff --git a/cmake/hipify.py b/cmake/hipify.py index a15577125eb1fef3584fe3ce11594dd353dd2071..55d378f5b11137739d21beb6d80df672040f66a4 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # A command line tool for running pytorch's hipify preprocessor on CUDA diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 375d254ba343ff9c0e968cb8c7984e1a56e0dd6a..8002dd74477b6f2d0a4c3e3d64bb7edf8128c635 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on ${NAME} extension source files.") diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu index 6743af0cf2dbab816b4c204320ebf484ea8516fb..f4b6b19f4b232c8bfd9e66ecf693a0dbfa3a1068 100644 --- a/csrc/attention/mla/cutlass_mla_kernels.cu +++ b/csrc/attention/mla/cutlass_mla_kernels.cu @@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options( {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, hw_info, - -1, // split_kv + 1, // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index d64f0d0a5c2a046bcf3210aab83bf1d6f9380232..1dd7101acc27ded603cd1999104e599571345308 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from typing import Union diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py index 15f008d4f61ed66ad7e0df5643796ed6176b2af5..49f33718a21e815f8219606bc11952f1ab0fed02 100644 --- a/csrc/moe/marlin_moe_wna16/generate_kernels.py +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import itertools import os diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 8fda434d452f9bfa085f36d0d22eb62f0af24185..c4faef731060a6d60fdfacb4d5f307bcf8d4e452 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -30,4 +30,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, int64_t BLOCK_SIZE_K, int64_t bit); #endif -bool moe_permute_unpermute_supported(); \ No newline at end of file +bool moe_permute_unpermute_supported(); + +void shuffle_rows(const torch::Tensor& input_tensor, + const torch::Tensor& dst2src_map, + torch::Tensor& output_tensor); \ No newline at end of file diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu index 9a7465261abfeb4ab1f36bf8bce86e570193c034..68f429fac18ab892bd8a7ebc7369841512f96773 100644 --- a/csrc/moe/moe_permute_unpermute_op.cu +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -130,6 +130,62 @@ void moe_unpermute( }); } +template +__global__ void shuffleInputRowsKernel(const T* input, + const int32_t* dst2src_map, T* output, + int64_t num_src_rows, + int64_t num_dst_rows, int64_t num_cols) { + int64_t dest_row_idx = blockIdx.x; + int64_t const source_row_idx = dst2src_map[dest_row_idx]; + + if (blockIdx.x < num_dst_rows) { + // Load 128-bits per thread + constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8; + using DataElem = cutlass::Array; + + // Duplicate and permute rows + auto const* source_row_ptr = + reinterpret_cast(input + source_row_idx * num_cols); + auto* dest_row_ptr = + reinterpret_cast(output + dest_row_idx * num_cols); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD; + + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + dest_row_ptr[elem_index] = source_row_ptr[elem_index]; + } + } +} + +void shuffle_rows(const torch::Tensor& input_tensor, + const torch::Tensor& dst2src_map, + torch::Tensor& output_tensor) { + TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(), + "Input and output tensors must have the same data type"); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + int64_t const blocks = output_tensor.size(0); + int64_t const threads = 256; + int64_t const num_dest_rows = output_tensor.size(0); + int64_t const num_src_rows = input_tensor.size(0); + int64_t const num_cols = input_tensor.size(1); + + TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)), + "num_cols must be divisible by 128 / " + "sizeof(input_tensor.scalar_type()) / 8"); + + MOE_DISPATCH(input_tensor.scalar_type(), [&] { + shuffleInputRowsKernel<<>>( + reinterpret_cast(input_tensor.data_ptr()), + dst2src_map.data_ptr(), + reinterpret_cast(output_tensor.data_ptr()), num_src_rows, + num_dest_rows, num_cols); + }); +} + #else void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights, diff --git a/csrc/moe/permute_unpermute_kernels/dispatch.h b/csrc/moe/permute_unpermute_kernels/dispatch.h index 41932cdd85bcd35b1623943695d05c6935cc6038..d0f1ea4aded3388353baaf9bb5ef49b893363002 100644 --- a/csrc/moe/permute_unpermute_kernels/dispatch.h +++ b/csrc/moe/permute_unpermute_kernels/dispatch.h @@ -14,12 +14,13 @@ __VA_ARGS__(); \ break; \ } -#define MOE_DISPATCH_FLOAT_CASE(...) \ - MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ - MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \ - MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) +#define MOE_DISPATCH_FLOAT_CASE(...) \ + MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) #define MOE_DISPATCH(TYPE, ...) \ MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__)) @@ -39,6 +40,11 @@ template <> struct ScalarType2CudaType { using type = __nv_bfloat16; }; +// uint8 for packed fp4 +template <> +struct ScalarType2CudaType { + using type = uint8_t; +}; // #if __CUDA_ARCH__ >= 890 // fp8 diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index a9379032245d9b74838ff30398cf796a6568ca72..10be47966f61189e995d45a8010a126edbfcc34c 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -516,9 +516,8 @@ void topk_softmax( topk, stream); } - else + else if (topk_indices.scalar_type() == at::ScalarType::UInt32) { - assert(topk_indices.scalar_type() == at::ScalarType::UInt32); vllm::moe::topkGatingSoftmaxKernelLauncher( gating_output.data_ptr(), topk_weights.data_ptr(), @@ -530,4 +529,17 @@ void topk_softmax( topk, stream); } + else { + assert(topk_indices.scalar_type() == at::ScalarType::Int64); + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } } diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 7d35ec79ead487ad0689fff3a4e19a0d25043635..a74eb3720cf1cf48433813da0a21ab0d79c87521 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -81,6 +81,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { m.def("moe_permute_unpermute_supported() -> bool"); m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported); + // Row shuffle for MoE + m.def( + "shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! " + "output_tensor) -> ()"); + m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows); + #endif } diff --git a/csrc/ops.h b/csrc/ops.h index fe38e83ffbeb0b207813402a0858de0a713155da..6b3d50ae8bfd85ae22286814a8e6a8b01da2a297 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, double epsilon); +void apply_repetition_penalties_(torch::Tensor& logits, + const torch::Tensor& prompt_mask, + const torch::Tensor& output_mask, + const torch::Tensor& repetition_penalties); + // void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, // torch::Tensor& weight, torch::Tensor& scale, // double epsilon); @@ -233,7 +238,8 @@ void cutlass_moe_mm( torch::Tensor const& b_tensors, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& expert_offsets, torch::Tensor const& problem_sizes, torch::Tensor const& a_strides, - torch::Tensor const& b_strides, torch::Tensor const& c_strides); + torch::Tensor const& b_strides, torch::Tensor const& c_strides, + bool per_act_token, bool per_out_ch); void cutlass_fp4_group_mm( torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b, @@ -245,7 +251,16 @@ void get_cutlass_moe_mm_data( const torch::Tensor& topk_ids, torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, torch::Tensor& input_permutation, torch::Tensor& output_permutation, - const int64_t num_experts, const int64_t n, const int64_t k); + const int64_t num_experts, const int64_t n, const int64_t k, + const std::optional& blockscale_offsets); + +void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, + torch::Tensor& problem_sizes1, + torch::Tensor& problem_sizes2, + const torch::Tensor& expert_num_tokens, + const int64_t num_local_experts, + const int64_t padded_m, const int64_t n, + const int64_t k); void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu index 84492553c02f2177e3fa81da1033fcb612e6f98c..4a8a5ed02d6ce454b33e2efbd353cfccfb98937e 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu @@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - TORCH_CHECK( - a.size(0) % 4 == 0, - "Input tensor must have a number of rows that is a multiple of 4. ", - "but got: ", a.size(0), " rows."); if (out.dtype() == torch::kBFloat16) { cutlass_gemm_blockwise_sm100_fp8_dispatch( out, a, b, a_scales, b_scales); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh index ef324364c6d5e01cc3f32222f07cfe0fdd20f589..c841125dbb734f31d97f64d2d94c73f841a7bae1 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -1,5 +1,6 @@ #pragma once +#include "cuda_utils.h" #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" @@ -22,49 +23,49 @@ namespace vllm { using namespace cute; -template +// clang-format off +template struct cutlass_3x_gemm_fp8_blockwise { + static constexpr bool swap_ab = swap_ab_; using ElementAB = cutlass::float_e4m3_t; using ElementA = ElementAB; using LayoutA = cutlass::layout::RowMajor; + using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; using ElementB = ElementAB; using LayoutB = cutlass::layout::ColumnMajor; + using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - using ElementC = void; using ElementD = OutType; using LayoutD = cutlass::layout::RowMajor; + using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + using ElementC = void; // TODO: support bias using LayoutC = LayoutD; + using LayoutC_Transpose = LayoutD_Transpose; static constexpr int AlignmentC = AlignmentD; using ElementAccumulator = float; using ElementCompute = float; using ElementBlockScale = float; - // MMA and Cluster Tile Shapes - // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster - // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>; - static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{}); - static constexpr int ScaleGranularityM = - size<0>(MmaTileShape{}) / ScaleMsPerTile; - static constexpr int ScaleGranularityN = - size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{}); - static constexpr int ScaleGranularityK = - size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{}); - - // Shape of the threadblocks in a cluster - using ClusterShape_MNK = ClusterShape; - - using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig< - ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, - cute::UMMA::Major::MN, cute::UMMA::Major::K>; + using ScaleConfig = conditional_t, + cutlass::detail::Sm100BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, + cute::UMMA::Major::MN, cute::UMMA::Major::K>>; + + // layout_SFA and layout_SFB cannot be swapped since they are deduced. using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); @@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise { static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; using ElementScalar = float; - // clang-format off using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, @@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise { ElementAccumulator, ElementCompute, ElementC, - LayoutC, + conditional_t, AlignmentC, ElementD, - LayoutD, + conditional_t, AlignmentD, EpilogueScheduler, DefaultOperation >::CollectiveOp; using StageCountType = cutlass::gemm::collective::StageCountAuto; - using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, - OperatorClass, - ElementA, - cute::tuple, - AlignmentA, - ElementB, - cute::tuple, - AlignmentB, - ElementAccumulator, - MmaTileShape, - ClusterShape, - + using CollectiveMainloop = conditional_t, + AlignmentB, + ElementA, + cute::tuple, + AlignmentA, + ElementAccumulator, + MmaTileShape, + ClusterShape, cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, - MainloopScheduler - >::CollectiveOp; - // clang-format on + MainloopScheduler + >::CollectiveOp, + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple, + AlignmentA, + ElementB, + cute::tuple, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp>; using KernelType = enable_sm100_only, CollectiveMainloop, CollectiveEpilogue>>; @@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { + static constexpr bool swap_ab = Gemm::swap_ab; using GemmKernel = typename Gemm::GemmKernel; using StrideA = typename Gemm::GemmKernel::StrideA; using StrideB = typename Gemm::GemmKernel::StrideB; @@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementD = typename Gemm::ElementD; int32_t m = a.size(0), n = b.size(1), k = a.size(1); - auto prob_shape = cute::make_shape(m, n, k, 1); StrideA a_stride; StrideB b_stride; @@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, b_stride = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); c_stride = - cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1)); + cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1)); - LayoutSFA layout_SFA = + LayoutSFA layout_SFA = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); - LayoutSFB layout_SFB = + LayoutSFB layout_SFB = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); auto a_ptr = static_cast(a.data_ptr()); @@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, auto a_scales_ptr = static_cast(a_scales.data_ptr()); auto b_scales_ptr = static_cast(b_scales.data_ptr()); - typename GemmKernel::MainloopArguments mainloop_args{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB}; + auto mainloop_args = [&](){ + // layout_SFA and layout_SFB cannot be swapped since they are deduced. + if (swap_ab) { + return typename GemmKernel::MainloopArguments{ + b_ptr, b_stride, a_ptr, a_stride, + b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB + }; + } + else { + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + } + }(); + auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1); auto c_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ @@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - auto m = a.size(0); - auto k = a.size(1); - auto n = b.size(1); - int sms; + int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); - auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) { - return std::ceil(static_cast(m) / tile1SM) * - std::ceil(static_cast(n) / tile1SM) >= - sms; - }; - bool use_2sm = should_use_2sm(m, n); - if (use_2sm) { - cutlass_gemm_caller_blockwise, Shape<_256, _1, _1>, - Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, - cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( - out, a, b, a_scales, b_scales); + constexpr int TILE_K = 128; + // TODO: better heuristics + bool swap_ab = (m < 16) || (m % 4 != 0); + bool use_tma_epilogue = (m * n) % 4 == 0; + if (!swap_ab) { + constexpr int TILE_N = 128; + int tile_m = 256; + if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) { + tile_m = 64; + } + else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) { + tile_m = 128; + } + if (tile_m == 64) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else if (tile_m == 128) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else { // tile_m == 256 + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } + } } else { + // TODO: Test more tile N configs + constexpr int TILE_M = 128; + constexpr int TILE_N = 16; + // TMA epilogue isn't compatible with Swap A/B cutlass_gemm_caller_blockwise, Shape<_128, _1, _1>, - Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, - cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + OutType, TILE_M, 1, TILE_K, Shape, Int, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>( out, a, b, a_scales, b_scales); } } diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh index 468b77d9593bc5ca42cde31f7866b7ff3f69e85e..6da2da63407590b9a4bd80dac08db1552e764609 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller; template typename Epilogue> struct sm100_fp8_config_default { + // M in (128, inf) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; @@ -25,6 +26,34 @@ struct sm100_fp8_config_default { KernelSchedule, EpilogueSchedule>; }; +template typename Epilogue> +struct sm100_fp8_config_M128 { + // M in (64, 128] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_128, _128, _64>; + using ClusterShape = Shape<_2, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M64 { + // M in [1, 64] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _8, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + template typename Epilogue, typename... EpilogueArgs> @@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, using Cutlass3xGemmDefault = typename sm100_fp8_config_default::Cutlass3xGemm; - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); + using Cutlass3xGemmM64 = + typename sm100_fp8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm100_fp8_config_M128::Cutlass3xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + // m in [1, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } } template