diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml index 3ea0b7bb5cd66f29e4146f6675dd8779f0942d35..4ef8b5c3709b3911e69808681d46f4b3dcbd795f 100644 --- a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml @@ -4,8 +4,8 @@ tasks: - name: "gsm8k" metrics: - name: "exact_match,strict-match" - value: 0.233 + value: 0.231 - name: "exact_match,flexible-extract" - value: 0.236 + value: 0.22 limit: 1000 num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 96e57dfd064758d59ca153473214912071bc2739..4ae23eff62f37eb1cf8c4260bfdf734cd0d707c6 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -13,6 +13,7 @@ from pathlib import Path import lm_eval import numpy +import pytest import yaml RTOL = 0.05 @@ -46,6 +47,10 @@ def test_lm_eval_correctness(): eval_config = yaml.safe_load( Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + if eval_config[ + "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501 + pytest.skip("FBGEMM is currently failing on main.") + # Launch eval requests. results = launch_lm_eval(eval_config) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index e031686c7a293b5c8b86ab2c1ab255e1dbf48f68..1030ec24e8d7fa9fe2742067e33f6d47e2acefda 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -84,8 +84,13 @@ if __name__ == "__main__": # this result is generated via `benchmark_serving.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + raw_result.update(command) # update the test name of this result @@ -99,8 +104,13 @@ if __name__ == "__main__": # this result is generated via `benchmark_latency.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + raw_result.update(command) # update the test name of this result @@ -121,8 +131,13 @@ if __name__ == "__main__": # this result is generated via `benchmark_throughput.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) + try: + with open(test_file.with_suffix(".commands")) as f: + command = json.loads(f.read()) + except OSError as e: + print(e) + continue + raw_result.update(command) # update the test name of this result diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 32bd34c431c894ab45ecee456f2f84f4969deee8..4d01a314adc47bdff43db1dcf76cacc822745f6b 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -426,7 +426,7 @@ main() { pip install -U transformers - pip install -r requirements-dev.txt + pip install -r requirements/dev.txt which genai-perf # check storage diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 9425cb07ec013e69df7afaf2c58c079e6211d140..4cd449b141ece1a1ffb3a14f460acaeca13be395 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -309,11 +309,14 @@ run_serving_tests() { new_test_name=$test_name"_qps_"$qps + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard client_command="python3 benchmark_serving.py \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ --request-rate $qps \ + --metadata "tensor_parallel_size=$tp" \ $client_args" echo "Running test case $test_name with qps $qps" @@ -358,7 +361,7 @@ main() { # get the current IP address, required by benchmark_serving.py export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') # turn of the reporting of the status of each request, to clean up the terminal output - export VLLM_LOG_LEVEL="WARNING" + export VLLM_LOGGING_LEVEL="WARNING" # prepare for benchmarking cd benchmarks || exit 1 diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/nightly-benchmarks/tests/throughput-tests.json index 91ef6d16be6381576f70beb3d326f29f7d8185b0..9bc87cbcd2bc50fe0d39e75bd6eadffcdea97cd3 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -32,4 +32,4 @@ "backend": "vllm" } } -] \ No newline at end of file +] diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 829414bf8a3bad16f31904be8b2e2782a6c666ca..18f582b6e4c9454eb23a86b8e9ee16750e8a6173 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,4 +1,15 @@ steps: + - label: "Build wheel - CUDA 12.4" + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/upload-wheels.sh" + env: + DOCKER_BUILDKIT: "1" + - label: "Build wheel - CUDA 12.1" agents: queue: cpu_queue_postmerge @@ -37,7 +48,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build and publish TPU release image" @@ -71,7 +82,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ." - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index f8bf1c87603f4513a4f23816314e254a724338f0..0680bae13ddbfd6928369d0d89f22008f49ae15b 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -77,7 +77,6 @@ echo "Commands:$commands" #ignore certain kernels tests if [[ $commands == *" kernels "* ]]; then commands="${commands} \ - --ignore=kernels/test_attention.py \ --ignore=kernels/test_attention_selector.py \ --ignore=kernels/test_blocksparse_attention.py \ --ignore=kernels/test_causal_conv1d.py \ @@ -92,19 +91,40 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_moe.py \ --ignore=kernels/test_prefix_prefill.py \ --ignore=kernels/test_rand.py \ - --ignore=kernels/test_sampler.py" + --ignore=kernels/test_sampler.py \ + --ignore=kernels/test_cascade_flash_attn.py \ + --ignore=kernels/test_mamba_mixer2.py \ + --ignore=kernels/test_aqlm.py \ + --ignore=kernels/test_machete_mm.py \ + --ignore=kernels/test_mha_attn.py \ + --ignore=kernels/test_block_fp8.py \ + --ignore=kernels/test_permute_cols.py" fi -#ignore certain Entrypoints tests +#ignore certain Entrypoints/openai tests if [[ $commands == *" entrypoints/openai "* ]]; then commands=${commands//" entrypoints/openai "/" entrypoints/openai \ - --ignore=entrypoints/openai/test_accuracy.py \ --ignore=entrypoints/openai/test_audio.py \ - --ignore=entrypoints/openai/test_encoder_decoder.py \ - --ignore=entrypoints/openai/test_embedding.py \ - --ignore=entrypoints/openai/test_oot_registration.py "} + --ignore=entrypoints/openai/test_chat.py \ + --ignore=entrypoints/openai/test_shutdown.py \ + --ignore=entrypoints/openai/test_completion.py \ + --ignore=entrypoints/openai/test_sleep.py \ + --ignore=entrypoints/openai/test_models.py \ + --ignore=entrypoints/openai/test_prompt_validation.py "} fi +#ignore certain Entrypoints/llm tests +if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then + commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} +fi + +# --ignore=entrypoints/openai/test_encoder_decoder.py \ +# --ignore=entrypoints/openai/test_embedding.py \ +# --ignore=entrypoints/openai/test_oot_registration.py +# --ignore=entrypoints/openai/test_accuracy.py \ +# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13 + + PARALLEL_JOB_COUNT=8 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 2ead1f51ed81edc436f8b88114a7bc73be13133f..e45e184852f29209f64d5065e82e1e89accd4ec6 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -19,13 +19,14 @@ remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" + --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 + --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 + export BUILDKITE_BUILD_NUMBER=$3 # offline inference docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " @@ -35,7 +36,8 @@ function cpu_tests() { # Run basic model test docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " set -e - pip install -r vllm/requirements-test.txt + pip install -r vllm/requirements/test.txt + pip install -r vllm/requirements/cpu.txt pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/encoder_decoder/language -m cpu_model @@ -85,4 +87,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER" diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 20aca328ba13595c773d8a00f258451c49ea70d9..5c004b47778fba18c27154dba1eee11a1a3f9ae8 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -14,6 +14,7 @@ DOCKER_BUILDKIT=1 docker build . \ -t gh200-test \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ + --build-arg RUN_WHEEL_CHECK=false \ --build-arg torch_cuda_arch_list="9.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" @@ -23,6 +24,6 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and test offline inference -docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' +docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B ' diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 55c374fcc33deeeb499b48d322b25e84cdcae55a..ad5ae6f41574856893ff22154e84be4d3864414c 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -44,11 +44,11 @@ remove_docker_container() { trap remove_docker_container EXIT # Run the image -docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ +docker run --rm -it --device=/dev/neuron0 --network bridge \ -v "${HF_CACHE}:${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \ -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh deleted file mode 100755 index a1103bed66ecbb1974b020d4fd28e6bb44663caa..0000000000000000000000000000000000000000 --- a/.buildkite/run-openvino-test.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# This script build the OpenVINO docker image and run the offline inference inside the container. -# It serves a sanity check for compilation and basic model usage. -set -ex - -# Try building the docker image -docker build -t openvino-test -f Dockerfile.openvino . - -# Setup cleanup -remove_docker_container() { docker rm -f openvino-test || true; } -trap remove_docker_container EXIT -remove_docker_container - -# Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-v1-test.sh similarity index 52% rename from .buildkite/run-tpu-test.sh rename to .buildkite/run-tpu-v1-test.sh index 650af0fac4c61ea47547a17eb5c31213f2688fcf..f0f53d3b716d78fcc3e239b758ad8dff0c75fb99 100755 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-v1-test.sh @@ -19,8 +19,20 @@ docker run --privileged --net host --shm-size=16G -it \ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ && python3 -m pip install pytest \ && python3 -m pip install lm_eval[api]==0.4.4 \ - && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \ - && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ - && python3 /workspace/vllm/tests/tpu/test_compilation.py \ - && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ - && python3 /workspace/vllm/examples/offline_inference/tpu.py" + && export VLLM_USE_V1=1 \ + && export VLLM_XLA_CHECK_RECOMPILATION=1 \ + && echo TEST_1 \ + && pytest /workspace/vllm/tests/tpu/test_compilation.py \ + && echo TEST_2 \ + && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ + && echo TEST_3 \ + && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \ + && echo TEST_4 \ + && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ + && echo TEST_5 \ + && python3 /workspace/vllm/examples/offline_inference/tpu.py" \ + + +# TODO: This test fails because it uses RANDOM_SEED sampling +# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ + diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index d48639e5720c50fb6782d85d8652ce932abf5f7a..3a0e6bdb2caaf5e5e655671d696d47ef9da63473 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -4,16 +4,28 @@ # It serves a sanity check for compilation and basic model usage. set -ex +image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" +container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" + # Try building the docker image -docker build -t xpu-test -f Dockerfile.xpu . +docker build -t ${image_name} -f Dockerfile.xpu . # Setup cleanup -remove_docker_container() { docker rm -f xpu-test || true; } +remove_docker_container() { + docker rm -f "${container_name}" || true; + docker image rm -f "${image_name}" || true; + docker system prune -f || true; +} trap remove_docker_container EXIT -remove_docker_container # Run the image and test offline inference/tensor parallel -docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 +docker run \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + --entrypoint="" \ + --name "${container_name}" \ + "${image_name}" \ + sh -c ' + VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m + VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2 ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 66efe3ed32986cd6fe1cf750e105adddb2b11fa9..217f869f1f3c5e7f1c77f9c37823fdf9457ca8b8 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -35,13 +35,12 @@ steps: fast_check: true no_gpu: True commands: - - pip install -r requirements-docs.txt + - pip install -r ../../requirements/docs.txt - SPHINXOPTS=\"-W\" make html # Check API reference (if it fails, you may have missing mock imports) - grep \"sig sig-object py\" build/html/api/inference_params.html - label: Async Engine, Inputs, Utils, Worker Test # 24min - fast_check: true source_file_dependencies: - vllm/ - tests/mq_llm_engine @@ -78,6 +77,7 @@ steps: - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py @@ -112,19 +112,19 @@ steps: - tests/entrypoints/test_chat_utils - tests/entrypoints/offline_mode commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/ + - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py - - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" num_gpus: 4 - fast_check: true source_file_dependencies: - vllm/distributed/ - vllm/core/ @@ -134,19 +134,26 @@ steps: - tests/compile/test_basic_correctness - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py commands: + # test with tp=2 and external_dp=2 + - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - - python3 ../examples/offline_inference/rlhf.py - - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py + - pushd ../examples/offline_inference + - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py + - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd - label: Metrics, Tracing Test # 10min num_gpus: 2 - fast_check: true source_file_dependencies: - vllm/ - tests/metrics @@ -194,15 +201,19 @@ steps: - tests/v1 commands: # split the test to avoid interference - - VLLM_USE_V1=1 pytest -v -s v1/core - - VLLM_USE_V1=1 pytest -v -s v1/engine - - VLLM_USE_V1=1 pytest -v -s v1/sample - - VLLM_USE_V1=1 pytest -v -s v1/worker - - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py - - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py + - pytest -v -s v1/core + - pytest -v -s v1/entrypoints + - pytest -v -s v1/engine + - pytest -v -s v1/entrypoints + - pytest -v -s v1/sample + - pytest -v -s v1/worker + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_stats.py + - pytest -v -s v1/test_utils.py + - pytest -v -s v1/test_oracle.py # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - VLLM_USE_V1=1 pytest -v -s v1/e2e + - pytest -v -s v1/e2e # Integration test for streaming correctness (requires special branch). - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine @@ -220,14 +231,17 @@ steps: - python3 offline_inference/basic/chat.py - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/vision_language.py - - python3 offline_inference/vision_language_multi_image.py - - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_embedding.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 + - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min mirror_hardwares: [amd] @@ -273,11 +287,10 @@ steps: source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py parallelism: 4 -- label: "PyTorch Fullgraph Smoke Test" # 9min - fast_check: true +- label: PyTorch Fullgraph Smoke Test # 9min source_file_dependencies: - vllm/ - tests/compile @@ -286,8 +299,9 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py + - pytest -v -s compile/test_pass_manager.py -- label: "PyTorch Fullgraph Test" # 18min +- label: PyTorch Fullgraph Test # 18min source_file_dependencies: - vllm/ - tests/compile @@ -372,7 +386,8 @@ steps: commands: - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py - - pytest -v -s models/test_initialization.py + # V1 Test: https://github.com/vllm-project/vllm/issues/14531 + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py - label: Language Models Test (Standard) # 32min #mirror_hardwares: [amd] @@ -500,8 +515,7 @@ steps: - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py commands: - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py + - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -514,13 +528,12 @@ steps: # this test fails consistently. # TODO: investigate and fix # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" num_gpus: 2 - fast_check: true source_file_dependencies: - vllm/plugins/ - tests/plugins/ @@ -586,6 +599,7 @@ steps: - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_minicpmv_tp.py + - pytest -v -s -x lora/test_transfomers_model.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh index 3c756659a715aa593c88e0615b0b18756fe33e5b..a681f892706002add0d74b8c7588637bc54b0786 100644 --- a/.buildkite/upload-wheels.sh +++ b/.buildkite/upload-wheels.sh @@ -50,8 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" +elif [[ $normal_wheel == *"cu121"* ]]; then + # if $normal_wheel matches cu121, do not upload the index.html + echo "Skipping index files for cu121 wheels" else - # only upload index.html for cu12 wheels (default wheels) + # only upload index.html for cu124 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" fi @@ -63,8 +66,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" +elif [[ $normal_wheel == *"cu121"* ]]; then + # if $normal_wheel matches cu121, do not upload the index.html + echo "Skipping index files for cu121 wheels" else - # only upload index.html for cu12 wheels (default wheels) + # only upload index.html for cu124 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" fi diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index bc324d8b988b1a9c5f76b8fb34deaa83b3c7880e..860c5c6cd53744f1de3c0c73983b91cf94f30fa8 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,27 +10,32 @@ /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin +/vllm/model_executor/guided_decoding @mgoin @russellb /vllm/multimodal @DarkLight1337 @ywang96 CMakeLists.txt @tlrmchlsmth # vLLM V1 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat +/vllm/v1/structured_output @mgoin @russellb # Test ownership +/.buildkite/lm-eval-harness @mgoin @simon-mo /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo -/tests/test_inputs.py @DarkLight1337 @ywang96 +/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac +/tests/distributed/test_multi_node_assignment.py @youkaichao +/tests/distributed/test_pipeline_parallel.py @youkaichao +/tests/distributed/test_same_node.py @youkaichao /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo +/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb +/tests/kernels @tlrmchlsmth @WoosukKwon +/tests/model_executor/test_guided_processors.py @mgoin @russellb /tests/models @DarkLight1337 @ywang96 +/tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu -/tests/spec_decode @njhill @LiuXiaoxuanPKU -/tests/kernels @tlrmchlsmth @WoosukKwon /tests/quantization @mgoin @robertgshaw2-redhat -/.buildkite/lm-eval-harness @mgoin @simon-mo -/tests/distributed/test_multi_node_assignment.py @youkaichao -/tests/distributed/test_pipeline_parallel.py @youkaichao -/tests/distributed/test_same_node.py @youkaichao -/tests/multi_step @alexm-redhat @comaniac +/tests/spec_decode @njhill @LiuXiaoxuanPKU +/tests/test_inputs.py @DarkLight1337 @ywang96 +/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb +/tests/v1/structured_output @mgoin @russellb /tests/weight_loading @mgoin @youkaichao -/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac diff --git a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml deleted file mode 100644 index 79e6e9080d51cc513a7a41ee7bf7a1d8baf2dad0..0000000000000000000000000000000000000000 --- a/.github/ISSUE_TEMPLATE/800-misc-discussion.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: 🎲 Misc/random discussions that do not fit into the above categories. -description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. -title: "[Misc]: " -labels: ["misc"] - -body: -- type: markdown - attributes: - value: > - #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). -- type: textarea - attributes: - label: Anything you want to discuss about vllm. - description: > - Anything you want to discuss about vllm. - validations: - required: true -- type: markdown - attributes: - value: > - Thanks for contributing 🎉! -- type: checkboxes - id: askllm - attributes: - label: Before submitting a new issue... - options: - - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. - required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 3ba13e0cec6cbbfd462e9ebf529dd2093148cd69..fa40268d677279166477c072c4d950e800e2d100 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1 +1,5 @@ blank_issues_enabled: false +contact_links: + - name: Questions + url: https://discuss.vllm.ai + about: Ask questions and discuss with other vLLM community members diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 683b70cd89989f9a91ada2749524c87bf455bf25..a017d69be9910d33415cca6447f4c84a5f17aea2 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,7 +23,7 @@ updates: - dependency-name: "lm-format-enforcer" - dependency-name: "gguf" - dependency-name: "compressed-tensors" - - dependency-name: "ray[adag]" + - dependency-name: "ray[cgraph]" # Ray Compiled Graph - dependency-name: "lm-eval" groups: minor-update: diff --git a/.github/mergify.yml b/.github/mergify.yml index 43bc5ce623d3cf08b78d4617925d50f39983741e..54f56210b286ac1fc593bf76fe950aebcc81efb2 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -5,6 +5,7 @@ pull_request_rules: - or: - files~=^[^/]+\.md$ - files~=^docs/ + - files~=^examples/ actions: label: add: @@ -35,6 +36,21 @@ pull_request_rules: add: - frontend +- name: label-multi-modality + description: Automatically apply multi-modality label + conditions: + - or: + - files~=^vllm/multimodal/ + - files~=^tests/multimodal/ + - files~=^tests/models/multimodal/ + - files~=^tests/models/*/audio_language/ + - files~=^tests/models/*/vision_language/ + - files=tests/models/test_vision.py + actions: + label: + add: + - multi-modality + - name: label-structured-output description: Automatically apply structured-output label conditions: diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index a4e9acc414d4480504e7acb44993a4d34115591d..b199d0867a648188e19d65b2e94deb28ec99b3bf 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -12,7 +12,7 @@ jobs: fetch-depth: 0 - name: Set up Helm - uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0 + uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0 with: version: v3.14.4 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index e40ceaaa8b037788f8cc0f26863bdda5230af3f1..bfd02879965eee1fb1eead062edd21f53798e14f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -39,7 +39,7 @@ jobs: const script = require('.github/workflows/scripts/create_release.js') await script(github, context, core) - # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow. + # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. # wheel: # name: Build Wheel # runs-on: ${{ matrix.os }} @@ -50,7 +50,7 @@ jobs: # matrix: # os: ['ubuntu-20.04'] # python-version: ['3.9', '3.10', '3.11', '3.12'] - # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. + # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt. # cuda-version: ['11.8', '12.1'] # steps: diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index 122e4e101e2011898335d8863f5bfae403ff2792..0f010832b465d25f376102dffef8f5fbd2735c7e 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH # Install requirements -$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt +$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 diff --git a/.github/workflows/scripts/create_release.js b/.github/workflows/scripts/create_release.js index 475742118afeb9301e977ffa5deb8e6d908819f7..0feb5dc2cf84b9832c749f47e2a3658f42df3b64 100644 --- a/.github/workflows/scripts/create_release.js +++ b/.github/workflows/scripts/create_release.js @@ -1,4 +1,4 @@ -// Uses Github's API to create the release and wait for result. +// Uses GitHub's API to create the release and wait for result. // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. module.exports = async (github, context, core) => { diff --git a/.gitignore b/.gitignore index 89dab8f13bab194a9414cd6f1142b7ce05326d70..6f5cbd0733da04ed1d6137892a0c632f8341194b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,8 @@ /vllm/_version.py # vllm-flash-attn built from source -vllm/vllm_flash_attn/ +vllm/vllm_flash_attn/* +!vllm/vllm_flash_attn/fa_utils.py # Byte-compiled / optimized / DLL files __pycache__/ @@ -197,7 +198,7 @@ _build/ hip_compat.h # Benchmark dataset -benchmarks/*.json +benchmarks/**/*.json # Linting actionlint diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b1967065c09b2bf3b53a9b96a6645000ec0f77a7..484cd171f5f52b768d40099accff1541b590d1a6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,7 @@ default_stages: - pre-commit # Run locally - manual # Run in CI +exclude: 'vllm/third_party/.*' repos: - repo: https://github.com/google/yapf rev: v0.43.0 @@ -8,13 +9,11 @@ repos: - id: yapf args: [--in-place, --verbose] additional_dependencies: [toml] # TODO: Remove when yapf is upgraded - exclude: 'vllm/third_party/.*' - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.3 hooks: - id: ruff args: [--output-format, github, --fix] - exclude: 'vllm/third_party/.*' - repo: https://github.com/codespell-project/codespell rev: v2.4.0 hooks: @@ -22,10 +21,9 @@ repos: additional_dependencies: ['tomli'] args: ['--toml', 'pyproject.toml'] - repo: https://github.com/PyCQA/isort - rev: 5.13.2 + rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0 hooks: - id: isort - exclude: 'vllm/third_party/.*' - repo: https://github.com/pre-commit/mirrors-clang-format rev: v19.1.7 hooks: @@ -38,12 +36,16 @@ repos: hooks: - id: pymarkdown args: [fix] - exclude: 'vllm/third_party/.*' - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: - id: actionlint - exclude: 'vllm/third_party/.*' +- repo: https://github.com/astral-sh/uv-pre-commit + rev: 0.6.2 + hooks: + - id: pip-compile + args: [requirements/test.in, -o, requirements/test.txt] + files: ^requirements/test\.(in|txt)$ - repo: local hooks: - id: mypy-local @@ -51,9 +53,8 @@ repos: entry: tools/mypy.sh 0 "local" language: python types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests] stages: [pre-commit] # Don't run in CI - exclude: 'vllm/third_party/.*' - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 entry: tools/mypy.sh 1 "3.9" @@ -61,7 +62,6 @@ repos: types: [python] additional_dependencies: *mypy_deps stages: [manual] # Only run in CI - exclude: 'vllm/third_party/.*' - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 entry: tools/mypy.sh 1 "3.10" @@ -69,7 +69,6 @@ repos: types: [python] additional_dependencies: *mypy_deps stages: [manual] # Only run in CI - exclude: 'vllm/third_party/.*' - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.11 entry: tools/mypy.sh 1 "3.11" @@ -77,7 +76,6 @@ repos: types: [python] additional_dependencies: *mypy_deps stages: [manual] # Only run in CI - exclude: 'vllm/third_party/.*' - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.12 entry: tools/mypy.sh 1 "3.12" @@ -85,19 +83,16 @@ repos: types: [python] additional_dependencies: *mypy_deps stages: [manual] # Only run in CI - exclude: 'vllm/third_party/.*' - id: shellcheck name: Lint shell scripts entry: tools/shellcheck.sh language: script types: [shell] - exclude: 'vllm/third_party/.*' - id: png-lint name: Lint PNG exports from excalidraw entry: tools/png-lint.sh language: script types: [png] - exclude: 'vllm/third_party/.*' - id: signoff-commit name: Sign-off Commit entry: bash @@ -110,13 +105,11 @@ repos: language: system verbose: true stages: [commit-msg] - exclude: 'vllm/third_party/.*' - id: check-spdx-header name: Check SPDX headers entry: python tools/check_spdx_header.py language: python types: [python] - exclude: 'vllm/third_party/.*' - id: check-filenames name: Check for spaces in all filenames entry: bash @@ -126,7 +119,6 @@ repos: language: system always_run: true pass_filenames: false - exclude: 'vllm/third_party/.*' # Keep `suggestion` last - id: suggestion name: Suggestion @@ -134,5 +126,4 @@ repos: language: system verbose: true pass_filenames: false - exclude: 'vllm/third_party/.*' # Insert new entries above the `suggestion` entry diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 284196bc2d2797b6157173fd9ff9f254d103a43b..2781ec223b665d1fde3d503c9a4608051400c1c4 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -18,4 +18,4 @@ formats: [] # Optionally declare the Python requirements required to build your docs python: install: - - requirements: docs/requirements-docs.txt + - requirements: requirements/docs.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index ef9ac6b6f29c4ff1ba813141acbf36016c8b1755..4a5d1aa297654040981332f0820fd84b2b118537 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ set(ignoreMe "${VLLM_PYTHON_PATH}") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. -set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") # Supported hcu architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx906;gfx926;gfx928;gfx936") @@ -50,8 +50,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") -set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1") +set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") # # Try to find python package with an executable that exactly matches @@ -178,6 +178,25 @@ include(FetchContent) file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") +# +# Set rocm version dev int. +# +if(VLLM_GPU_LANG STREQUAL "HIP") + # + # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info + # + set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3") + + + # + # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates + # a lot of warnings that always mask real issues. Suppressing until this is properly addressed. + # + set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result") +endif() + # # Define other extension targets # @@ -242,7 +261,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. # Please keep this in sync with FetchContent_Declare line below. - set(CUTLASS_REVISION "v3.7.0" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -260,7 +279,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git # Please keep this in sync with CUTLASS_REVISION line above. - GIT_TAG v3.7.0 + GIT_TAG v3.8.0 GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. @@ -280,6 +299,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/cutlass_extensions/common.cpp") @@ -290,7 +310,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -310,43 +330,87 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") " in CUDA target architectures") endif() + # Only build AllSpark kernels if we are building for at least some compatible archs. + cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}") + if (ALLSPARK_ARCHS) + set(ALLSPARK_SRCS + "csrc/quantization/gptq_allspark/allspark_repack.cu" + "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu") + set_gencode_flags_for_srcs( + SRCS "${ALLSPARK_SRCS}" + CUDA_ARCHS "${ALLSPARK_ARCHS}") + list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}") + message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}") + else() + message(STATUS "Not building AllSpark kernels as no compatible archs found" + " in CUDA target architectures") + endif() + + + set(SCALED_MM_3X_ARCHS) # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require - # CUDA 12.0 or later (and only work on Hopper, 9.0a for now). - cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - set(SRCS - "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" + # CUDA 12.0 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + CUDA_ARCHS "${SCALED_MM_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") - message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) - message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " "later if you intend on running FP8 quantized models on " "Hopper.") else() - message(STATUS "Not building scaled_mm_c3x as no compatible archs found " + message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found " "in CUDA target architectures") endif() + endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't - # build any 3x kernels - set(SCALED_MM_3X_ARCHS) + # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require + # CUDA 12.8 or later + cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + set(SRCS + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" + "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" + ) + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${SCALED_MM_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1") + # Let scaled_mm_c2x know it doesn't need to build these arches + list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}") + message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}") + else() + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS) + message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is " + "not >= 12.8, we recommend upgrading to CUDA 12.8 or " + "later if you intend on running FP8 quantized models on " + "Blackwell.") + else() + message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found " + "in CUDA target architectures") + endif() endif() # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -371,17 +435,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # 2:4 Sparse Kernels # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor - # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) + # require CUDA 12.2 or later (and only work on Hopper). + cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") + CUDA_ARCHS "${SCALED_MM_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1") - message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") + message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS) message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is " "not >= 12.2, we recommend upgrading to CUDA 12.2 or later " "if you intend on running FP8 sparse quantized models on Hopper.") @@ -394,9 +459,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # FP4 Archs and flags cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) - set(SRCS + set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" - ) + "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}") @@ -481,6 +546,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # if CUDA endif endif() +if(VLLM_GPU_LANG STREQUAL "HIP") + list(APPEND VLLM_EXT_SRC + "csrc/custom_all_reduce.cu") +endif() + message(STATUS "Enabling C extension.") define_gpu_extension_target( _C @@ -490,6 +560,7 @@ define_gpu_extension_target( COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) @@ -508,12 +579,24 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") +if(VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu") +endif() + set_gencode_flags_for_srcs( SRCS "${VLLM_MOE_EXT_SRC}" CUDA_ARCHS "${CUDA_ARCHS}") if(VLLM_GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") + set(VLLM_MOE_WNA16_SRC + "csrc/moe/moe_wna16.cu") + + set_gencode_flags_for_srcs( + SRCS "${VLLM_MOE_WNA16_SRC}" + CUDA_ARCHS "${CUDA_ARCHS}") + + list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) set(MARLIN_MOE_SRC "csrc/moe/marlin_kernels/marlin_moe_kernel.h" @@ -569,81 +652,8 @@ if(VLLM_GPU_LANG STREQUAL "HIP") endif() ]] -# vllm-flash-attn currently only supported on CUDA -if (NOT VLLM_GPU_LANG STREQUAL "CUDA") - return() +# For CUDA we also build and ship some external projects. +if (VLLM_GPU_LANG STREQUAL "CUDA") + include(cmake/external_projects/flashmla.cmake) + include(cmake/external_projects/vllm_flash_attn.cmake) endif () - -# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target -# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the -# arches in the CUDA case (and instead set the gencodes on a per file basis) -# we need to manually set VLLM_GPU_ARCHES here. -if(VLLM_GPU_LANG STREQUAL "CUDA") - foreach(_ARCH ${CUDA_ARCHS}) - string(REPLACE "." "" _ARCH "${_ARCH}") - list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real") - endforeach() -endif() - -# -# Build vLLM flash attention from source -# -# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM. -# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs. -# They should be identical but if they aren't, this is a massive footgun. -# -# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place. -# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3). -# If no component is specified, vllm-flash-attn is still installed. - -# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading. -# This is to enable local development of vllm-flash-attn within vLLM. -# It can be set as an environment variable or passed as a cmake argument. -# The environment variable takes precedence. -if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR}) - set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR}) -endif() - -if(VLLM_FLASH_ATTN_SRC_DIR) - FetchContent_Declare( - vllm-flash-attn SOURCE_DIR - ${VLLM_FLASH_ATTN_SRC_DIR} - BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn - ) -#[[ -else() - FetchContent_Declare( - vllm-flash-attn - GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade - GIT_PROGRESS TRUE - # Don't share the vllm-flash-attn build between build types - BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn - ) -]] -endif() - - -#[[ -# Fetch the vllm-flash-attn library -FetchContent_MakeAvailable(vllm-flash-attn) -message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") - -# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in -# case only one is built, in the case both are built redundant work is done) -install( - DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn - COMPONENT _vllm_fa2_C - FILES_MATCHING PATTERN "*.py" -) - -install( - DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn - COMPONENT _vllm_fa3_C - FILES_MATCHING PATTERN "*.py" -) - -# Nothing after vllm-flash-attn, see comment about macros above -]] \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 310e003d427dae44ac6a1980a833a814bebc376d..d1ecef586d50bc70ccbaa34a1cd2344f07c587a9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,9 +28,13 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version # Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ python3 -m pip install uv +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels RUN apt-get install -y gcc-10 g++-10 @@ -53,15 +57,16 @@ WORKDIR /workspace # we need to install torch and torchvision from the nightly builds first, # pytorch will not appear as a vLLM dependency in all of the following steps # after this step -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \ + uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ + uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \ fi -COPY requirements-common.txt requirements-common.txt -COPY requirements-cuda.txt requirements-cuda.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - uv pip install --system -r requirements-cuda.txt +COPY requirements/common.txt requirements/common.txt +COPY requirements/cuda.txt requirements/cuda.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/cuda.txt # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -79,15 +84,19 @@ FROM base AS build ARG TARGETPLATFORM # install build dependencies -COPY requirements-build.txt requirements-build.txt +COPY requirements/build.txt requirements/build.txt + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 -RUN --mount=type=cache,target=/root/.cache/pip \ - uv pip install --system -r requirements-build.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt COPY . . ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi # max jobs used by Ninja to build extensions ARG max_jobs=2 @@ -101,7 +110,7 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # if USE_SCCACHE is set, use sccache to speed up compilation -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" = "1" ]; then \ echo "Installing sccache..." \ @@ -121,9 +130,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi @@ -143,11 +155,15 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ #################### DEV IMAGE #################### FROM base as dev -COPY requirements-lint.txt requirements-lint.txt -COPY requirements-test.txt requirements-test.txt -COPY requirements-dev.txt requirements-dev.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - uv pip install --system -r requirements-dev.txt +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY requirements/lint.txt requirements/lint.txt +COPY requirements/test.txt requirements/test.txt +COPY requirements/dev.txt requirements/dev.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/dev.txt #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### @@ -178,9 +194,13 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version # Install uv for faster pip installs -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ python3 -m pip install uv +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image @@ -191,14 +211,15 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # we need to install torch and torchvision from the nightly builds first, # pytorch will not appear as a vLLM dependency in all of the following steps # after this step -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \ + uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319"; \ + uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \ fi # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ - --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ uv pip install --system dist/*.whl --verbose # If we need to build FlashInfer wheel before its release: @@ -213,10 +234,10 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # $ ls dist # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \ + uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ fi COPY examples examples @@ -224,9 +245,9 @@ COPY examples examples # some issues w.r.t. JIT compilation. Therefore we need to # install build dependencies for JIT compilation. # TODO: Remove this once FlashInfer AOT wheel is fixed -COPY requirements-build.txt requirements-build.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - uv pip install --system -r requirements-build.txt +COPY requirements/build.txt requirements/build.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt #################### vLLM installation IMAGE #################### @@ -237,16 +258,20 @@ FROM vllm-base AS test ADD . /vllm-workspace/ +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ - uv pip install --system -r requirements-dev.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/dev.txt # install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -e tests/vllm_test_utils # enable fast downloads from hf (for testing) -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system hf_transfer ENV HF_HUB_ENABLE_HF_TRANSFER 1 @@ -265,12 +290,16 @@ RUN mv vllm test_docs/ # base openai image with additional requirements, for any subsequent openai-style images FROM vllm-base AS vllm-openai-base +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + # install additional dependencies for openai api server -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ else \ - uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ + uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \ fi ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.arm b/Dockerfile.arm index 093ee2209222f775df3e6faf2ed4c9efd386b89d..bad093684239c400ae8c598bd413e2759a0bf6eb 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -26,18 +26,18 @@ WORKDIR /workspace ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ pip install --upgrade pip && \ - pip install -r requirements-build.txt + pip install -r requirements/build.txt FROM cpu-test-arm AS build WORKDIR /workspace/vllm RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ - pip install -v -r requirements-cpu.txt + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + pip install -v -r requirements/cpu.txt COPY . . ARG GIT_REPO_CHECK=0 diff --git a/Dockerfile.cpu b/Dockerfile.cpu index ebe226cf6d148fa7651f7988dc4c8c66d0940d5d..a10090529d8a98ecd51979f2b92e728c34ce4560 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -22,25 +22,25 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install intel_extension_for_pytorch==2.5.0 +RUN pip install intel_extension_for_pytorch==2.6.0 WORKDIR /workspace ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \ pip install --upgrade pip && \ - pip install -r requirements-build.txt + pip install -r requirements/build.txt FROM cpu-test-1 AS build WORKDIR /workspace/vllm RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ - --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ - pip install -v -r requirements-cpu.txt + --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \ + --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \ + pip install -v -r requirements/cpu.txt COPY . . ARG GIT_REPO_CHECK=0 diff --git a/Dockerfile.hpu b/Dockerfile.hpu index 66cf68c32f2cacd036402217ba38cd3db3e1543b..48211c88f872bc331b404784cdbddc2429bee6ba 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -4,7 +4,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm -RUN pip install -v -r requirements-hpu.txt +RUN pip install -v -r requirements/hpu.txt ENV no_proxy=localhost,127.0.0.1 ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 27658d836d988fe4bf17ca9e6f546acd0704213e..067645906366e42696fc4efdfd92d788f04d10da 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -36,7 +36,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN python3 -m pip install -U \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements-neuron.txt + -r requirements/neuron.txt ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ diff --git a/Dockerfile.openvino b/Dockerfile.openvino deleted file mode 100644 index 32bcbfa9cc16812118c6451ff2219d630126599d..0000000000000000000000000000000000000000 --- a/Dockerfile.openvino +++ /dev/null @@ -1,29 +0,0 @@ -# The vLLM Dockerfile is used to construct vLLM image that can be directly used -# to run the OpenAI compatible server. - -FROM ubuntu:22.04 AS dev - -RUN apt-get update -y && \ - apt-get install -y \ - git python3-pip \ - ffmpeg libsm6 libxext6 libgl1 -WORKDIR /workspace - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -RUN python3 -m pip install -U pip -# install build requirements -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt -# build vLLM with OpenVINO backend -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace - -COPY examples/ /workspace/examples -COPY benchmarks/ /workspace/benchmarks - -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils - -CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index c4c1f3e357972b143c0326f64ceb933c6daf9362..c5ca20d76e3e062c3bfbd7ffc81ed7e9b9fa1ae3 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -6,7 +6,7 @@ ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/" RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev -# Some packages in requirements-cpu are installed here +# Some packages in requirements/cpu are installed here # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba # Currently these may not be available for venv or pip directly RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=cache,target=/root/.cache/pip \ RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ - -r requirements-cpu.txt \ + -r requirements/cpu.txt \ xformers uvloop==0.20.0 RUN --mount=type=bind,source=.git,target=.git \ diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 14c522afd7f9e9ef8a1161dde0a22a3ef9a15466..841e7978a424f331af06ebf793c70598bc9d3f44 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -38,14 +38,14 @@ FROM fetch_vllm AS build_vllm ARG USE_CYTHON # Build vLLM RUN cd vllm \ - && python3 -m pip install -r requirements-rocm.txt \ + && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ - && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \ + && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \ && python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / -COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt / +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples @@ -60,7 +60,8 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ - && pip install -U -r requirements-rocm.txt \ + && pip install -U -r requirements/rocm.txt \ + && pip install -U -r requirements/rocm-test.txt \ && pip uninstall -y vllm \ && pip install *.whl @@ -99,7 +100,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \ # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ cd /install \ - && pip install -U -r requirements-rocm.txt \ + && pip install -U -r requirements/rocm.txt \ && pip uninstall -y vllm \ && pip install *.whl diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base index e33e73b303098fed3929cf779034b6884c36deba..38d6a33636eba57ecf97b3cbd5ea0daacb3eaa55 100644 --- a/Dockerfile.rocm_base +++ b/Dockerfile.rocm_base @@ -12,6 +12,8 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="b7d29fb" ARG FA_REPO="https://github.com/ROCm/flash-attention.git" +ARG AITER_BRANCH="21d47a9" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -129,8 +131,18 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl +ARG AITER_REPO +ARG AITER_BRANCH +RUN git clone --recursive ${AITER_REPO} +RUN cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter + ARG BASE_IMAGE ARG HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH ARG LEGACY_HIPBLASLT_OPTION ARG RCCL_BRANCH ARG RCCL_REPO @@ -155,4 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ - && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ + && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/Dockerfile.s390x b/Dockerfile.s390x new file mode 100644 index 0000000000000000000000000000000000000000..5a84dc12d8f713c6b8fa6d1b955ad0b09bb7dbdd --- /dev/null +++ b/Dockerfile.s390x @@ -0,0 +1,152 @@ +# Base UBI image for s390x architecture +ARG BASE_UBI_IMAGE_TAG=9.5-1736404155 +ARG PYTHON_VERSION=3.12 +FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base + +# Install basic dependencies +ARG PYTHON_VERSION +ENV PYTHON_VERSION=${PYTHON_VERSION} + +WORKDIR /workspace + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +# Install development utilities +RUN microdnf install -y \ + which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ + libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ + openssl-devel openblas openblas-devel autoconf automake libtool cmake && \ + microdnf clean all + +# Python Installation +FROM base AS python-install +ARG PYTHON_VERSION + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +ENV PYTHON_VERSION=${PYTHON_VERSION} +RUN microdnf install -y \ + python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \ + python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all + +FROM python-install AS pyarrow + +# Build Apache Arrow +WORKDIR /tmp +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/apache/arrow.git && \ + cd arrow/cpp && \ + mkdir release && cd release && \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DARROW_PYTHON=ON \ + -DARROW_PARQUET=ON \ + -DARROW_ORC=ON \ + -DARROW_FILESYSTEM=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_JSON=ON \ + -DARROW_CSV=ON \ + -DARROW_DATASET=ON \ + -DPROTOBUF_PROTOC_EXECUTABLE=/usr/bin/protoc \ + -DARROW_DEPENDENCY_SOURCE=BUNDLED \ + .. && \ + make -j$(nproc) && \ + make install && \ + cd ../../python && \ + export PYARROW_PARALLEL=4 && \ + export ARROW_BUILD_TYPE=release && \ + uv pip install -r requirements/build.txt && \ + python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel + +FROM python-install AS numa-build +# Install numactl (needed for numa.h dependency) +WORKDIR /tmp +RUN curl -LO https://github.com/numactl/numactl/archive/refs/tags/v2.0.16.tar.gz && \ + tar -xvzf v2.0.16.tar.gz && \ + cd numactl-2.0.16 && \ + ./autogen.sh && \ + ./configure && \ + make + +# Set include path +ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" + +FROM python-install AS rust +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + . "$CARGO_HOME/env" && \ + rustup default stable && \ + rustup show + +FROM python-install AS torch-vision +# Install torchvision +ARG TORCH_VERSION=2.7.0.dev20250304 +ARG TORCH_VISION_VERSION=v0.20.1 +WORKDIR /tmp +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/pytorch/vision.git && \ + cd vision && \ + git checkout $TORCH_VISION_VERSION && \ + uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ + python setup.py bdist_wheel + +# Final build stage +FROM python-install AS vllm-cpu +ARG PYTHON_VERSION + +# Set correct library path for torch and numactl +ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH" +ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" +ENV UV_LINK_MODE=copy +ENV CARGO_HOME=/root/.cargo +ENV RUSTUP_HOME=/root/.rustup +ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" + +COPY . /workspace/vllm +WORKDIR /workspace/vllm + +RUN --mount=type=bind,from=numa-build,src=/tmp/numactl-2.0.16,target=/numactl \ + make -C /numactl install + +# Install dependencies, including PyTorch and Apache Arrow +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ + --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ + --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ + --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ + sed -i '/^torch/d' requirements/build.txt && \ + ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ + VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ + uv pip install -v \ + $ARROW_WHL_FILE \ + $VISION_WHL_FILE \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + --index-strategy unsafe-best-match \ + -r requirements/build.txt \ + -r requirements/cpu.txt + +# Build and install vllm +RUN --mount=type=cache,target=/root/.cache/uv \ + VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ + uv pip install "$(echo dist/*.whl)[tensorizer]" + +# setup non-root user for vllm +RUN umask 002 && \ + useradd --uid 2000 --gid 0 vllm && \ + mkdir -p /home/vllm && \ + chmod g+rwx /home/vllm + +COPY LICENSE /licenses/vllm.md +COPY examples/*.jinja /app/data/template/ + +USER 2000 +WORKDIR /home/vllm + +# Set the default entrypoint +ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/Dockerfile.tpu b/Dockerfile.tpu index e268b39476665e88e8d2d6941bcbb6309182439d..50806d8820a301990e403a2d2dec8dc66caca792 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -15,11 +15,14 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi +# Remove existing versions of dependencies +RUN pip uninstall -y torch torch_xla torchvision + ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - -r requirements-tpu.txt + -r requirements/tpu.txt RUN python3 setup.py develop # install development dependencies (for testing) diff --git a/Dockerfile.xpu b/Dockerfile.xpu index a374f20d7d949ee617ac1f257fc3db626ef0cff9..ad4abf16b43b6ae658192e9e2eab1e4bee5b531a 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -1,11 +1,7 @@ -FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base +# oneapi 2025.0.2 docker base image use rolling 2448 package. https://dgpu-docs.intel.com/releases/packages.html?release=Rolling+2448.13&os=Ubuntu+22.04, and we don't need install driver manually. +FROM intel/deep-learning-essentials:2025.0.2-0-devel-ubuntu22.04 AS vllm-base -RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ - echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ - chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ - wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ - echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ - chmod 644 /usr/share/keyrings/intel-graphics.gpg +RUN rm /etc/apt/sources.list.d/intel-graphics.list RUN apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ @@ -21,30 +17,20 @@ RUN apt-get update -y && \ python3 \ python3-dev \ python3-pip \ - # vim \ wget WORKDIR /workspace/vllm -COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt -COPY requirements-common.txt /workspace/vllm/requirements-common.txt +COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt +COPY requirements/common.txt /workspace/vllm/requirements/common.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ - -r requirements-xpu.txt - -RUN git clone https://github.com/intel/pti-gpu && \ - cd pti-gpu/sdk && \ - git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ - mkdir build && \ - cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ - make -j && \ - cmake --install . --config Release --prefix "/usr/local" + -r requirements/xpu.txt ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" COPY . . -ARG GIT_REPO_CHECK +ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi @@ -54,6 +40,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 setup.py install +# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu +# FIXME: This will be fix in ipex 2.7. just leave this here for awareness. +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install intel-extension-for-pytorch==2.6.10+xpu \ + --extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ + CMD ["/bin/bash"] FROM vllm-base AS vllm-openai diff --git a/MANIFEST.in b/MANIFEST.in index 82be639ef4d739ce67ff982ecbe82de09aae1afd..82fd22b845f099d01d95ae03cadd619dafdc4843 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,9 @@ include LICENSE -include requirements-common.txt -include requirements-cuda.txt -include requirements-rocm.txt -include requirements-neuron.txt -include requirements-cpu.txt +include requirements/common.txt +include requirements/cuda.txt +include requirements/rocm.txt +include requirements/neuron.txt +include requirements/cpu.txt include CMakeLists.txt recursive-include cmake * diff --git a/README.md b/README.md index ccdab1a3cdcbed16cd3a5162a5da4d7f0f3b368d..0131a3354ed97521dac154f521029811bb06cefa 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention高效管理kv内存,Continuous batching传入请求,支持很多Hugging Face模型,如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。 ## 暂不支持的官方功能 -- **量化推理**:目前支持fp16的推理和gptq,awq-int4推理,marlin的权重量化、kv-cache fp8推理方案暂不支持 +- **量化推理**:目前不支持marlin的权重量化、kv-cache fp8推理方案 - **模块支持**:目前不支持Sliding window attention @@ -85,7 +85,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ ## 验证 -- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.3; +- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.8.2; ## Known Issue - 无 diff --git a/README_ORIGIN.md b/README_ORIGIN.md index d24cf6b79b87a6be3f114df4dd2c9b0560081c69..5bf51e32ae22e298b88f740ae635870f0cbe8e8e 100644 --- a/README_ORIGIN.md +++ b/README_ORIGIN.md @@ -10,20 +10,29 @@ Easy, fast, and cheap LLM serving for everyone
-| Documentation | Blog | Paper | Twitter/X | Developer Slack | +| Documentation | Blog | Paper | Twitter/X | User Forum | Developer Slack |
--- -We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9) +[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center! + +[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day) --- *Latest News* 🔥 +- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). +- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). +- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! + +| Dataset | +Online | +Offline | +Data Path | +
|---|---|---|---|
| ShareGPT | +✅ | +✅ | +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json |
+
| BurstGPT | +✅ | +✅ | +wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv |
+
| Sonnet | +✅ | +✅ | +Local file: benchmarks/sonnet.txt |
+
| Random | +✅ | +✅ | +synthetic |
+
| HuggingFace | +🟡 | +🟡 | +Specify your dataset path on HuggingFace | +
| VisionArena | +✅ | +✅ | +lmarena-ai/vision-arena-bench-v0.1 (a HuggingFace dataset) |
+
((const P**)&dp.ptrs[0], idx);
}
- multi_gpu_barrier (sg.signals[target]);
}
auto tmp_out = tmps[0];
- multi_gpu_barrier (ptrs, idx);
}
- multi_gpu_barrier