diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index e29eb78a9f9451bc990860605dd1d25da39c2d9a..a378bc6baa5a5e8b4327654ffd3d595445f1365a 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,12 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB # Note that we have 400 MiB quota, please use it wisely. # See https://github.com/pypi/support/issues/3792 . # Please also sync the value with the one in Dockerfile. -VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300)) +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 8350e2705141e59cd613a902edba8a5421edd75a..36e1b6c01326aa136e3cbb3cf2f585697f77a50e 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import os diff --git a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2928d75ce4469a0b7fa68ecac828847470e621e4 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 +model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.6353 + - name: "exact_match,flexible-extract" + value: 0.637 +limit: null +num_fewshot: null diff --git 
a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index afc935c1a931832246680d34a4d9ea51aa8708b2..96e57dfd064758d59ca153473214912071bc2739 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ LM eval harness on model to compare vs HF baseline computed offline. Configs are found in configs/$MODEL.yaml diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 9d3646e2f6a15d3224ae1714ca0eeea0689f655b..e031686c7a293b5c8b86ab2c1ab255e1dbf48f68 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os from pathlib import Path diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index 68ac5909e59517fe2416c050aaba85c1bb44aef9..5e17b79d26a1ba4c735d9c61252d859c14e7eed2 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse from transformers import AutoTokenizer diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 052060c576300333b7c822fcdfbd3ebe9667523d..0ff95a0911b16d57e7137fab28ca0ebca90113e4 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import json from pathlib import Path 
diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py index 18bcc3a8714c48f6af69df8ead16927f601a939d..e5f179a0f5b68b4f684869bc652827f69d6266ef 100644 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from lmdeploy.serve.openai.api_client import APIClient api_client = APIClient("http://localhost:8000") diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 92d6fad73a94c0fe2c1821da908cccca8388c525..62ee5e10b5095fcdc2ea177450f163aa0102b33c 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import datetime import json import os diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh index 3e4e409466b8ace12b5e5daeb45d80ff74f8ab10..99972afa21d1e04b4b7da3e8b99622a81c89cc76 100644 --- a/.buildkite/run-gh200-test.sh +++ b/.buildkite/run-gh200-test.sh @@ -23,6 +23,6 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and test offline inference -docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' - python3 examples/offline_inference/basic.py +docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' + python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B ' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d5d02fdeb7f4b7a7685269c391aba037173e1080..7ef40564c5bd287c92900a41b552c3e7b6e01303 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -50,9 +50,9 @@ steps: - 
tests/multimodal - tests/test_utils - tests/worker - - tests/standalone_tests/lazy_torch_compile.py + - tests/standalone_tests/lazy_imports.py commands: - - python3 standalone_tests/lazy_torch_compile.py + - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py @@ -128,6 +128,7 @@ steps: - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile - examples/offline_inference/rlhf.py + - examples/offline_inference/ray_placement.py commands: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py @@ -136,6 +137,7 @@ steps: # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - python3 ../examples/offline_inference/rlhf.py + - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py - label: Metrics, Tracing Test # 10min num_gpus: 2 @@ -349,6 +351,7 @@ steps: - vllm/ - tests/models commands: + - pytest -v -s models/test_transformers.py - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py @@ -485,6 +488,7 @@ steps: - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml index 
30db1721a9df71b4b7ef017208d7b6053cf8cb01..d4113da8b5b8168f089dd8288a4c45cf299a6d01 100644 --- a/.github/ISSUE_TEMPLATE/400-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml @@ -30,15 +30,6 @@ body: validations: required: true -- type: textarea - attributes: - label: Model Input Dumps - description: | - If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process. - placeholder: | - Upload the dumped input file. - validations: - required: false - type: textarea attributes: label: 🐛 Describe the bug diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index df62539c0b3d9dfadf4537ed68ef906eb65c365f..27318c2fdd93f837bd93608cde7225807fd3c72f 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -2,7 +2,6 @@ name: PR Reminder Comment Bot on: pull_request_target: types: [opened] - jobs: pr_reminder: runs-on: ubuntu-latest @@ -15,7 +14,12 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. 
\n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + + '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + + 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' + + 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' + + 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' + + '🚀' }) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ae518e1902f53469185b7fefc21530b21b70312d..4568efcbba211f623566c389752f45fdc716cb5d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,10 +97,14 @@ repos: language: system verbose: true stages: [commit-msg] + - id: check-spdx-header + name: Check SPDX headers + entry: python tools/check_spdx_header.py + language: python + types: [python] - id: suggestion name: Suggestion entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' language: system verbose: true pass_filenames: false - diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f801b5f8f55133dd75c41d8c8494a5ce774ffb98..1a9596841cc65a85d080b6988a90b885a23a6184 100644 --- 
a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -61,7 +61,7 @@ representative at an online or offline/IRL event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement in the #code-of-conduct -channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +channel in the [vLLM Slack](https://slack.vllm.ai). All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the diff --git a/Dockerfile b/Dockerfile index 0b9f74e08dc68c878d9ffbc0d59e8920eaac8c8b..7ecb643f46272ae64ee318a6c313ec0b73eea68d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py -ARG VLLM_MAX_SIZE_MB=300 +ARG VLLM_MAX_SIZE_MB=400 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ diff --git a/README.md b/README.md index 04462a21e638cccf701aca6c9a260665adda60ed..be5017f2bf14727263c7e015bd9b6422984945b7 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ ## 验证 -- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.1; +- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.2; ## Known Issue - 无 diff --git a/README_ORIGIN.md b/README_ORIGIN.md index db548db5a0522135be477e9b24dabddd83d3b2e6..c02404748820a55cc39031a9fd49a501d21f5628 100644 --- a/README_ORIGIN.md +++ b/README_ORIGIN.md @@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone

-| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack | +| Documentation | Blog | Paper | Twitter/X | Developer Slack |

--- @@ -36,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone ## About vLLM is a fast and easy-to-use library for LLM inference and serving. -Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry. +Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. vLLM is fast with: @@ -139,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us * For technical questions and feature requests, please use Github issues or discussions. -* For discussing with fellow users, please use Discord. -* For coordinating contributions and development, please use Slack. +* For discussing with fellow users and coordinating contributions and development, please use Slack. * For security disclosures, please use Github's security advisory feature. * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. 
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0612e8778aca5f93462a70cf9ca9db863b63f3f8..364b087b841d38ac80c2007a0d774c81f00cd51c 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os import sys diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py index 6d787a50680db1be75f02da472762ca3dd32d397..d6a626a534b19fb578c8ec83354a7b8af3c43f85 100644 --- a/benchmarks/benchmark_guided.py +++ b/benchmarks/benchmark_guided.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark guided decoding throughput.""" import argparse import dataclasses diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e0bc1e47ef5498d300bf43db3c86a0812f7f92e9..aec511d77ae320c5b6ec7765c6d47787da808162 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark the latency of processing a single batch of requests.""" import argparse import dataclasses diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 0b8fba38156f109220a7b10c7cda23a535e4c189..21480578edbd5212b9b362ee4d2fa336f44fccef 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Offline benchmark to test the long document QA throughput. diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 843ad3d71a53e6ee31508be563e88c15a71bc945..e74cf338f57bfde6e4d7829e6b7a159b4fad2b51 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Benchmark the efficiency of prefix caching. 
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index d885c3005a7ed775755ab8632aa90aa3b1dca446..54b646919fce565a2f031e7fe46e019ffc4383b5 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark offline prioritization.""" import argparse import dataclasses diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 8b3212831e7e09672eaa006379e4494000e55c34..e934d228f7fd46d27f7a34de1750d1713ade9425 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 r"""Benchmark online serving throughput. On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py index 4435d87e18a8a18ae9ae77668eb6508c9eb1fe76..561e500d8b6c493861bb2c10c445bf643782d573 100644 --- a/benchmarks/benchmark_serving_guided.py +++ b/benchmarks/benchmark_serving_guided.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 r"""Benchmark online serving throughput with guided decoding. 
On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 7568da7b2ba678f647d997feeb6335462465d7f8..62b4502670d52212ef42bd61edf5504c702fe504 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Benchmark offline inference throughput.""" import argparse import dataclasses diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 3d1c5e392f9e2d6d73eec429e52e949d940cc189..468a1b2868f0c3f6bcc36dfd8ff871c3d95e1dec 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import itertools diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index ef06fcd6604ddd210cdf02052a76538564c14357..bab377800729b2e2b99ae7747bd4607202d6cfca 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Cutlass bench utils from typing import Iterable, Tuple diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index b87496ca3b2b4d7510c550e898322e4dcef0af44..6552b62dae8814e3a67762cdef7ef40a62416ec6 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import itertools diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index d58fb0bf86374dd45d8d8217c3ad3cfb1be125e9..3d1121df40d01c4b051cb3ce6abac2ed0921a9ea 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ 
b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) # Example: diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 4058b1c0a3b79928ca0d365604fc2dcd2b3ad985..980e68668911f7bd28a5b7c5e87f9781966bfbfc 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import aiohttp diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index 6eb5f639800701cb376bcc3e534a0f96737d1b86..c2ad4916bf0775ab4543afeb50ad24594cb65fee 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import itertools diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index e59d8bb0e6c8c16aff5cee94cce69ac4698099f9..a7b4b9e8bf302975458a675a1710e9eb653c1551 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import matplotlib.pyplot as plt diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index ef91f9f8eb52965a54cd00c6a96e5fbd19d24ade..c56cc743845e9d511edf6cbf968bf8ce86279cf2 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle as pkl import time from dataclasses import dataclass diff --git 
a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 601c4ea439aea7f199c26bb9dfb20f46ef45f850..8d20b91560dd62cb0c404e813da4d64fe48dda69 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys from typing import Optional diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 7acea6087fdfd314ba06701db97e0cdf803cec7d..d265c91bfeffceb60edcc2f6ed138fdd77bd4482 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import torch diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index e1f613e1da50964b6d678be2cd183dca08f90726..ecde8fbaa15b87e012c5bce3d81273098ca1190d 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import json diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 46bab74ae8adfe7598a093c60165223049a3859f..0301fee1a88640d4dea2f64c0b7f6970c65a8ff2 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import itertools diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 8fb44e3a3dbd8515be7e8a9bed38d256add4b2a1..c22e66c0b0c94dc72681d276d58a954109b48f23 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import torch diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 
860abe080f3c40a76b517fc235b32a79af7b08f1..4d2f8d140e74ebc83b9787065f0b2ed1d7a9cdcb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import time from datetime import datetime diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index d11874cd04bf4b440ffa4a77f82389951ca1e990..df8bec6174491d18b1e9dc228358d9c38ac1b801 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random import time from typing import List, Optional diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 1d62483448946c172ae7dbf944324f776c2d8d10..0ddea9390d7779dd3608864c22013b1798fa19d5 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import torch diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index baa5de0fff1bdaaebe95f621903e8d6b5a721d4a..dba153742da4f30371a5193c3cb6a6231c129043 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from typing import Optional, Tuple, Union diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 250d505168d0989201f458d13c208e0f71a56b15..8ee0212a0c11dcffcc4979682d0bd5b8161f1acd 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import accumulate from typing import List, Optional diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 
4eeeca35a37cc031f2a810946733d488357fd26d..c375e61e418731cacbfa93975bf65452a12398ad 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + WEIGHT_SHAPES = { "ideal": [[4 * 256 * 32, 256 * 32]], "mistralai/Mistral-7B-v0.1/TP1": [ diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 7d0bd84150a27886a07acbb8cf913245f2d8742e..01d97d63d7cf09be408928784857746acdd29288 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import pickle import re diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index fee877b6f76facf7b7b5ef05ec29439f70a22ce5..7281707484921e7f358d046c437b6a02697ec612 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Any, Callable, Iterable, Optional diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 51f24f3ba17746ad6970b5053731ef464cab7365..89b05d5882a381ce789230a64672811382ebdb8b 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) # Example: diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index d16d6f9fba44213c1a85d2c0ce12d4ac5bd54b84..5f94552e9dc85233b82daf85b162aec47f287284 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import cProfile import pstats diff --git a/cmake/hipify.py b/cmake/hipify.py index 340e41c8179e33c88aec4eee21efba7febbf65b9..a15577125eb1fef3584fe3ce11594dd353dd2071 100755 --- 
a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 # # A command line tool for running pytorch's hipify preprocessor on CUDA diff --git a/collect_env.py b/collect_env.py index 254c19b19a5acdb994cce87d02e68055e1f83a3a..0ec9d4cae4ba7a96866251167a2db3dfbd5020d4 100644 --- a/collect_env.py +++ b/collect_env.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py diff --git a/csrc/cache.h b/csrc/cache.h index d99f3bf3175aba99da07cc6c300b62b32c333399..cdf671b8d28668c0faab35352db4b80059426c74 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -15,6 +15,9 @@ void copy_blocks(std::vector const& key_caches, std::vector const& value_caches, const torch::Tensor& block_mapping); +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping); + void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 8c4bdd314b7ebc71ad495a86785661dac5b8f685..7792a689427eff26cb50d8c420072f4b9ed08662 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -46,7 +46,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, char* src_ptr = static_cast(src.data_ptr()); char* dst_ptr = static_cast(dst.data_ptr()); - const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + const int64_t block_size_in_bytes = src.element_size() * src.stride(0); const at::cuda::OptionalCUDAGuard device_guard( src_device.is_cuda() ? 
src_device : dst_device); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -93,6 +96,24 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs, } } +// Kernel for MLA, which works on a single joint kv_cache +// Grid: (num_layers, num_pairs) +template +__global__ void copy_blocks_mla_kernel( + int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping, + const int mem_footprint_per_block) { + const int layer_idx = blockIdx.x; + const int pair_idx = blockIdx.y; + scalar_t* cache = reinterpret_cast(cache_ptrs[layer_idx]); + int64_t src_block = block_mapping[2 * pair_idx]; + int64_t dst_block = block_mapping[2 * pair_idx + 1]; + int64_t src_offset = src_block * mem_footprint_per_block; + int64_t dst_offset = dst_block * mem_footprint_per_block; + for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) { + cache[dst_offset + i] = cache[src_offset + i]; + } +} + } // namespace vllm // Note: the key_caches and value_caches vectors are constant but @@ -147,6 +168,42 @@ void copy_blocks(std::vector const& key_caches, })); } +// copy blocks kernel for MLA (assumes a joint KV-cache) +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping) { + int num_layers = kv_caches.size(); + if (num_layers == 0) { + return; + } + torch::Device cache_device = kv_caches[0].device(); + TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA"); + + std::vector cache_ptrs(num_layers); + for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + cache_ptrs[layer_idx] = + reinterpret_cast(kv_caches[layer_idx].data_ptr()); + } + torch::Tensor cache_ptrs_tensor = + torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64) + .to(cache_device); + + int num_pairs = block_mapping.size(0); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + int mem_footprint_per_block = 
kv_caches[0].stride(0); + dim3 grid(num_layers, num_pairs); + dim3 block(std::min(1024, mem_footprint_per_block)); + const at::cuda::OptionalCUDAGuard device_guard(cache_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( + kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] { + vllm::copy_blocks_mla_kernel<<>>( + cache_ptrs_tensor.data_ptr(), + block_mapping.data_ptr(), mem_footprint_per_block); + })); +} + namespace vllm { template @@ -382,6 +439,7 @@ __global__ void concat_and_cache_mla_kernel( // + pe_dim)] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, // + const int entry_stride, // const int kv_c_stride, // const int k_pe_stride, // const int kv_lora_rank, // @@ -402,9 +460,8 @@ __global__ void concat_and_cache_mla_kernel( int src_stride, int dst_stride, int size, int offset) { for (int i = threadIdx.x; i < size; i += blockDim.x) { const int64_t src_idx = token_idx * src_stride + i; - const int64_t dst_idx = block_idx * block_stride + - block_offset * (kv_lora_rank + pe_dim) + i + - offset; + const int64_t dst_idx = + block_idx * block_stride + block_offset * entry_stride + i + offset; if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { dst[dst_idx] = src[src_idx]; } else { @@ -660,16 +717,14 @@ void write_cache_multi_layers( CALL_WRITE_CACHE_MULTI_LAYERS); } - - -#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ - vllm::concat_and_cache_mla_kernel \ - <<>>( \ - reinterpret_cast(kv_c.data_ptr()), \ - reinterpret_cast(k_pe.data_ptr()), \ - reinterpret_cast(kv_cache.data_ptr()), \ - slot_mapping.data_ptr(), block_stride, kv_c_stride, \ - k_pe_stride, kv_lora_rank, pe_dim, block_size, \ +#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ + vllm::concat_and_cache_mla_kernel \ + <<>>( \ + reinterpret_cast(kv_c.data_ptr()), \ + reinterpret_cast(k_pe.data_ptr()), \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), 
block_stride, entry_stride, \ + kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ reinterpret_cast(scale.data_ptr())); void concat_and_cache_mla( @@ -699,6 +754,7 @@ void concat_and_cache_mla( int kv_c_stride = kv_c.stride(0); int k_pe_stride = k_pe.stride(0); int block_stride = kv_cache.stride(0); + int entry_stride = kv_cache.stride(1); dim3 grid(num_tokens); dim3 block(std::min(kv_lora_rank, 512)); diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index b401736c9824b49b74c1c3c2718f2289f0e61801..d5a5e2ef83dd808fc4b0815784b05488c3e4888b 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from typing import Dict, Union diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 7e0a25afbfec49ab658a7c284c23371563b7d7a6..379c184af3465f84240c7a4f2fa87fc944d6d311 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -197,6 +197,72 @@ __global__ void moe_align_block_size_global_mem_kernel( } } +// taken from +// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a +template +__global__ void sgl_moe_align_block_size_kernel( + scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, + int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts, + int32_t block_size, size_t numel, int32_t* cumsum) { + __shared__ int32_t shared_counts[32][8]; + __shared__ int32_t local_offsets[256]; + + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; + const int experts_per_warp = 8; + const int my_expert_start = warp_id * experts_per_warp; + + for (int i = 0; i < experts_per_warp; ++i) { + if (my_expert_start + i < num_experts) { + shared_counts[warp_id][i] = 0; + } + } + + const size_t tokens_per_thread = 
CEILDIV(numel, blockDim.x); + const size_t start_idx = threadIdx.x * tokens_per_thread; + + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + int expert_id = topk_ids[i]; + int warp_idx = expert_id / experts_per_warp; + int expert_offset = expert_id % experts_per_warp; + atomicAdd(&shared_counts[warp_idx][expert_offset], 1); + } + + __syncthreads(); + + if (threadIdx.x == 0) { + cumsum[0] = 0; + for (int i = 1; i <= num_experts; ++i) { + int expert_count = 0; + int warp_idx = (i - 1) / experts_per_warp; + int expert_offset = (i - 1) % experts_per_warp; + expert_count = shared_counts[warp_idx][expert_offset]; + + cumsum[i] = + cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size; + } + *total_tokens_post_pad = cumsum[num_experts]; + } + + __syncthreads(); + + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } + local_offsets[threadIdx.x] = cumsum[threadIdx.x]; + } + + __syncthreads(); + + for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { + int32_t expert_id = topk_ids[i]; + int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1); + sorted_token_ids[rank_post_pad] = i; + } +} + template __global__ void moe_sum_kernel( scalar_t* __restrict__ out, // [..., d] @@ -305,6 +371,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, } } +void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad) { + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] { + // calc needed amount of shared mem for `tokens_cnts` and `cumsum` + // tensors + auto options_int = + 
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); + // torch::Tensor token_cnts_buffer = + // torch::empty({(num_experts + 1) * num_experts}, options_int); + torch::Tensor cumsum_buffer = + torch::empty({num_experts + 1}, options_int); + + auto kernel = vllm::moe::sgl_moe_align_block_size_kernel; + kernel<<<1, 1024, 0, stream>>>( + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, + topk_ids.numel(), cumsum_buffer.data_ptr()); + }); +} + void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] torch::Tensor& output) // [num_tokens, hidden_size] { diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 596cc0aa6c855d24cb2a7f5723ecd81e070a6819..66bb5f41b7f783d562715df9fdb0d0f150efe85a 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -12,3 +12,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size, torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad); + +void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, + torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index f3a558c14ab93f76e01e0a3c754bdf6746085cde..8540633dcc8b0839dc287d3ad7edc4367a3b6861 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -22,6 +22,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " Tensor! num_tokens_post_pad) -> ()"); m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); + // temporarily adapted from + // https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a + m.def( + "sgl_moe_align_block_size(Tensor topk_ids, int num_experts," + " int block_size, Tensor! sorted_token_ids," + " Tensor! experts_ids," + " Tensor! 
num_tokens_post_pad) -> ()"); + m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size); + #ifndef USE_ROCM m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 72d549e597df58e100b9be1d2fee13650b2474c8..e40f2822996855caffe228e0a21084b6abb7ae40 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -16,29 +16,11 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); - using GroupShape = std::array; - int M = a.size(0), N = b.size(1), K = a.size(1); - GroupShape a_scale_group_shape = [&, &s = a_scales]() -> GroupShape { - if (s.numel() == 1) return {M, K}; // tensor-wise - if (s.dim() == 2) - return {ceil_div(a.size(0), s.size(0)), ceil_div(a.size(1), s.size(1))}; - TORCH_CHECK(false, "Unsupported scale shape for scale_a"); - }(); - - GroupShape b_scale_group_shape = [&, &s = b_scales]() -> GroupShape { - if (s.numel() == 1) return {K, N}; // tensor-wise - if (s.dim() == 2) - return {ceil_div(b.size(0), s.size(0)), ceil_div(b.size(1), s.size(1))}; - TORCH_CHECK(false, "Unsupported scale shape for scale_b"); - }(); - - if ((a_scale_group_shape == GroupShape{M, K} || - a_scale_group_shape == GroupShape{1, K}) && - (b_scale_group_shape == GroupShape{K, N} || - b_scale_group_shape == GroupShape{K, 1})) { - // "standard per-tensor/per-token/per-channel" scaling + if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) && + (b_scales.numel() == 1 || b_scales.numel() == b.size(1))) { + // Standard per-tensor/per-token/per-channel scaling TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); if (a.dtype() == torch::kFloat8_e4m3fn) { vllm::cutlass_scaled_mm_sm90_fp8(c, a, b, a_scales, b_scales, bias); @@ -46,25 +28,32 @@ 
void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, TORCH_CHECK(a.dtype() == torch::kInt8); vllm::cutlass_scaled_mm_sm90_int8(c, a, b, a_scales, b_scales, bias); } - } else if (a_scale_group_shape == GroupShape{1, 128} && - b_scale_group_shape == GroupShape{128, 128}) { + } else { + using GroupShape = std::array; + auto make_group_shape = [](torch::Tensor const& x, + torch::Tensor const& s) -> GroupShape { + TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D"); + return {ceil_div(x.size(0), s.size(0)), ceil_div(x.size(1), s.size(1))}; + }; + + GroupShape a_scale_group_shape = make_group_shape(a, a_scales); + GroupShape b_scale_group_shape = make_group_shape(b, b_scales); + // 1x128 per-token group scales for activations // 128x128 blockwise scales for weights - TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn && - b.dtype() == torch::kFloat8_e4m3fn, - "Currently only FP8 is supported for A group shape 1x128 and " - "B group shape 128x128"); - TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm"); - - vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales); - } else { - TORCH_CHECK(false, - "Unsupported scale group shapes for CUTLASS 3.x GEMM.\n " - "a_scale_group_shape must be [1, 128], got: [", + TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} && + b_scale_group_shape == GroupShape{128, 128} && + a.dtype() == torch::kFloat8_e4m3fn && + b.dtype() == torch::kFloat8_e4m3fn), + "cutlass_scaled_mm only supports datatype float8_e4m3fn.\n" + "a_scale_group_shape must be [1, 128]. Got: [", a_scale_group_shape[0], ", ", a_scale_group_shape[1], "]\n" - "b_scale_group_shape must be [128, 128], got: [", + "b_scale_group_shape must be [128, 128]. 
Got: [", b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]"); + TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm"); + + vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales); } } diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index a9b5ddf4cbdd2cc6c03dafc5667aa26aad1c2316..02e59fe28b9af0e4aec5c0d21957e8af9948da2e 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools import math import os diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 01f46d7228c07cc60ba2ae67597c42e31c5f8d5b..9a3a45baa1f743711771124d87be02f89518838e 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -640,6 +640,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { "Tensor block_mapping) -> ()"); cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks); + cache_ops.def( + "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()"); + cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla); + // Reshape the key and value tensors and cache them. cache_ops.def( "reshape_and_cache(Tensor key, Tensor value," diff --git a/docs/source/conf.py b/docs/source/conf.py index 6b0a1dad142b71c884b35e899d2532ba84cb9962..f4e8c8b949102c170c38e0c79ecac1e0ff4d578f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full @@ -35,7 +37,6 @@ author = 'the vLLM Team' # ones.
extensions = [ "sphinx.ext.napoleon", - "sphinx.ext.viewcode", "sphinx.ext.linkcode", "sphinx.ext.intersphinx", "sphinx_copybutton", diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index 6c6f3b701cd285751967d15256acc68df142707e..66a7554da846379201a17fcbd24db695216913a9 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -250,7 +250,11 @@ def get_max_image_tokens(self) -> int: And thus, we can override the method as: ```python -def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: +def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], +) -> Mapping[str, int]: return {"image": self.get_max_image_tokens()} ``` diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 30735b1161ff36622b199d36865ce8852b937753..fa0bebeb8ba1c9c3653ce444200c5c9c1f08d187 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -2,12 +2,6 @@ # AutoAWQ -:::{warning} -Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better -accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency -inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. -::: - To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. The main benefits are lower latency and memory usage. 
diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index da87127057dc557b77b3cc0a2f2756a8825e2fce..1e468962cc9c507562e5f1ece3f9df349700a79b 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -131,7 +131,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) llm = LLM( model="meta-llama/Meta-Llama-3.1-70B-Instruct", tensor_parallel_size=4, - speculative_model="ibm-fms/llama3-70b-accelerator", + speculative_model="ibm-ai-platform/llama3-70b-accelerator", speculative_draft_tensor_parallel_size=1, ) outputs = llm.generate(prompts, sampling_params) @@ -149,11 +149,11 @@ limitation will be fixed in a future release. A variety of speculative models of this type are available on HF hub: -- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator) -- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator) -- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator) -- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator) -- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator) +- [llama-13b-accelerator](https://huggingface.co/ibm-ai-platform/llama-13b-accelerator) +- [llama3-8b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-8b-accelerator) +- [codellama-34b-accelerator](https://huggingface.co/ibm-ai-platform/codellama-34b-accelerator) +- [llama2-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama2-70b-accelerator) +- [llama3-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-70b-accelerator) - [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator) - [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator) - [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) diff --git 
a/docs/source/generate_examples.py b/docs/source/generate_examples.py index ac592e22328da44c92bcabf113d87bf1fa599bfa..9d4de18a3b79dcc1a00700aa86a442040250c5f2 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools import re from dataclasses import dataclass, field diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 4116826789e5c818e23c11e5eeb08159b4a45ebb..ef02d9a078a1ba0cc6ee85ab5375aebe57982bac 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -36,7 +36,7 @@ VLLM_TARGET_DEVICE=xpu python setup.py install :::{note} - FP16 is the default data type in the current XPU backend. The BF16 data - type will be supported in the future. + type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet. ::: ## Set up using Docker diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 5fe940abc45534c31da22c371dfc08e6f9f828f8..d420ec7a2fa16ab8b1ab6a320590f5d9b39542a8 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -40,6 +40,82 @@ If vLLM successfully returns text (for generative models) or hidden states (for Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. +### Transformers fallback + +After the merge of , `vllm` can fallback to models that are available in `transformers`. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned! 
+ +To check if the backend is `transformers`, you can simply do this: + +```python +from vllm import LLM +llm = LLM(model=..., task="generate") # Name or path of your model +llm.apply_model(lambda model: print(model.__class__)) +``` + +If it is `TransformersModel` then it means it's based on `transformers`! + +#### Supported features + +##### LORA and quantization + +Neither is supported yet! Make sure to open an issue and we'll work on this together with the `transformers` team! + +Usually `transformers` models load weights via the `load_adapter` API, which depends on PEFT. We need to work a bit to either use this API (for now this would result in some weights not being marked as loaded) or replace modules accordingly. + +A hint as to what this would look like: + +```python +class TransformersModel(nn.Module, SupportsLoRA): + def __init__(*): + ... + self.model.load_adapter(vllm_config.load_config.model_loader_extra_config["qlora_adapter_name_or_path"]) +``` + +The blocker is that you need to specify the supported LoRA layers, when we would ideally want to load whatever is inside the checkpoint! + +##### Remote code + +This fallback also means that any model on the hub that can be used in `transformers` with `trust_remote_code=True` that correctly implements attention can be used in production! + +```python +from vllm import LLM +llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model +llm.apply_model(lambda model: print(model.__class__)) +``` + +A model just needs the following two things: + +```python +from transformers import PreTrainedModel +from torch import nn + +class MyAttention(nn.Module): + + def forward(self, hidden_states, **kwargs): # <- kwargs are required + + ... + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + **kwargs, + ) + ...
+ +class MyModel(PreTrainedModel): + _supports_attention_backend = True +``` + +Here is what happens in the background: + +1. The config is loaded. +2. The `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`. +3. The `TransformersModel` backend is used. See `/model_executors/models/transformers`, which leverages `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTIONS`. + +That's it! + ### ModelScope To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: @@ -650,14 +726,14 @@ See [this page](#generative-models) for more information on how to use generativ * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. * * ✅︎ - * + * \* - * `Idefics3ForConditionalGeneration` * Idefics3 * T + I * `HuggingFaceM4/Idefics3-8B-Llama3` etc. * ✅︎ * - * + * ✅︎ - * `InternVLChatModel` * InternVL 2.5, Mono-InternVL, InternVL 2.0 * T + IE+ @@ -723,7 +799,7 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `NVLM_D_Model` * NVLM-D 1.0 - * T + IE+ + * T + I+ * `nvidia/NVLM-D-72B`, etc. * * ✅︎ @@ -770,11 +846,18 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `Qwen2_5_VLForConditionalGeneration` + * Qwen2.5-VL + * T + IE+ + VE+ + * `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. + * + * ✅︎ + * ✅︎ - * `UltravoxModel` * Ultravox * T + AE+ * `fixie-ai/ultravox-v0_3` - * + * ✅︎ * ✅︎ * ✅︎ ::: @@ -783,7 +866,11 @@ See [this page](#generative-models) for more information on how to use generativ + Multiple items can be inputted per text prompt for this modality. :::{note} -To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. +::: + +:::{note} +H2O-VL series models will be available in V1 once we support backends other than FlashAttention. ::: :::{note} @@ -796,8 +883,11 @@ For more details, please see: ::: :::{note} -The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). -A corrected version is available at . +`mistral-community/pixtral-12b` does not support V1 yet. +::: + +:::{note} +To use Qwen2.5-VL series models, you have to install Huggingface `transformers` library from source via `pip install git+https://github.com/huggingface/transformers`. ::: ### Pooling Models diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 3f9ca27eb438ea6705b5c69d40b3e24d79e7d59e..6d136147c8dd2e654a110340df8c880deeabad6b 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -60,7 +60,8 @@ bash run_cluster.sh \ vllm/vllm-openai \ ip_of_head_node \ --head \ - /path/to/the/huggingface/home/in/this/node + /path/to/the/huggingface/home/in/this/node \ + -e VLLM_HOST_IP=ip_of_this_node ``` On the rest of the worker nodes, run the following command: @@ -70,10 +71,11 @@ bash run_cluster.sh \ vllm/vllm-openai \ ip_of_head_node \ --worker \ - /path/to/the/huggingface/home/in/this/node + /path/to/the/huggingface/home/in/this/node \ + -e VLLM_HOST_IP=ip_of_this_node ``` -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. 
+Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses. Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. @@ -103,3 +105,7 @@ Please make sure you downloaded the model to all the nodes (with the same path), When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. ::: + +:::{warning} +If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` to see the IP address used by Ray. See for more information. 
+::: diff --git a/examples/offline_inference/aqlm_example.py b/examples/offline_inference/aqlm_example.py index 40f9a21ec9e51287f9955a32a17dce910a1662a1..e8db3811ff1712349c277e2243760dceb78fa3a7 100644 --- a/examples/offline_inference/aqlm_example.py +++ b/examples/offline_inference/aqlm_example.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams from vllm.utils import FlexibleArgumentParser diff --git a/examples/offline_inference/arctic.py b/examples/offline_inference/arctic.py index 1fec3c99eb47c417661773ac416514536580ab54..90c88446c5146fc1d02a070798222bc7a95b3317 100644 --- a/examples/offline_inference/arctic.py +++ b/examples/offline_inference/arctic.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams # Sample prompts. diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 5952ec13ec3cbed2a3abb49f0b0f52d46f1f2fe5..707ca9f878961a8393ec11015a367654b5e5e2a8 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use vLLM for running offline inference with the correct prompt format on audio language models. 
diff --git a/examples/offline_inference/basic.py b/examples/offline_inference/basic.py index 63ddf1079aa24a793fab692415e115ebdd77105b..b3a8fc7b87eec1399dbe9c59228c8ff7dbf83866 100644 --- a/examples/offline_inference/basic.py +++ b/examples/offline_inference/basic.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams if __name__ == '__main__': diff --git a/examples/offline_inference/basic_with_model_default_sampling.py b/examples/offline_inference/basic_with_model_default_sampling.py index 346bb80b1e23f59d487de26b778a0ef871469b40..80de9428f6a9a8a7c44eaf646a7aa66a6d3b632a 100644 --- a/examples/offline_inference/basic_with_model_default_sampling.py +++ b/examples/offline_inference/basic_with_model_default_sampling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM # Sample prompts. diff --git a/examples/offline_inference/chat.py b/examples/offline_inference/chat.py index 8814f4d7bef0de945e01ab582122e0a2d50e2f58..dbc710cc8a0b73815adc0b1eaa4a8186779f2ad9 100644 --- a/examples/offline_inference/chat.py +++ b/examples/offline_inference/chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index e69a6c067e4da3ccf34056c8ff24b097cca67a38..15519bfed9cb49bbe893557ee7e3f2604a04e3d9 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa import json import random diff --git a/examples/offline_inference/classification.py b/examples/offline_inference/classification.py index de539b639a19611db31dda39f1fcc3e4dbdab04b..4a364aeb8c47b81c344b7e3a3376b43802cc4ad1 100644 --- a/examples/offline_inference/classification.py +++ b/examples/offline_inference/classification.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + from vllm import LLM # Sample prompts. diff --git a/examples/offline_inference/cli.py b/examples/offline_inference/cli.py index 391ac6b9b6b03ce0214f935ae2a057dd95b1c263..bc6833b3f39c5c473ebaa1f4a7b43c72c17bc00d 100644 --- a/examples/offline_inference/cli.py +++ b/examples/offline_inference/cli.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import asdict from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/cpu_offload.py b/examples/offline_inference/cpu_offload.py index b152e5bc37e6da2a9d4a2b43d6f685e381d5bea1..5511eb738778a9dfdf941ac1e96448b05db1f9a8 100644 --- a/examples/offline_inference/cpu_offload.py +++ b/examples/offline_inference/cpu_offload.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams # Sample prompts. diff --git a/examples/offline_inference/distributed.py b/examples/offline_inference/distributed.py index 677127844ccdde6cf36b91f3eb007e61f219d36a..a2df41d4ce21b0c255f1e7bc19becf6d2cafd23f 100644 --- a/examples/offline_inference/distributed.py +++ b/examples/offline_inference/distributed.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use Ray Data for running offline batch inference distributively on a multi-nodes cluster. diff --git a/examples/offline_inference/embedding.py b/examples/offline_inference/embedding.py index 58d004313ad518c9a72ddd2e47455b51a6493964..f9399329d24f32411915a39cf6dd68c195ec76c0 100644 --- a/examples/offline_inference/embedding.py +++ b/examples/offline_inference/embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM # Sample prompts. 
diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 0f266d7918853e02ae7e94c7319357e8fab2ae99..8765d1812cc53a008099e6a1de814263a3802d98 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 ''' Demonstrate prompting of text-to-text encoder/decoder models, specifically BART diff --git a/examples/offline_inference/florence2_inference.py b/examples/offline_inference/florence2_inference.py index c24096e90004b4b1ada6db0ea6b3f8d09c51bb68..58610b0fd2a5122d07279fb379a1eb4fc1c5c0b1 100644 --- a/examples/offline_inference/florence2_inference.py +++ b/examples/offline_inference/florence2_inference.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 ''' Demonstrate prompting of text-to-text encoder/decoder models, specifically Florence-2 diff --git a/examples/offline_inference/gguf_inference.py b/examples/offline_inference/gguf_inference.py index aa05c4c0bfaa5f04d2206d323236d3f732e36fce..0447e74e0d6f6272d1920658027016f66c18985a 100644 --- a/examples/offline_inference/gguf_inference.py +++ b/examples/offline_inference/gguf_inference.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from huggingface_hub import hf_hub_download from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index 60d894aae96920e783ef3ff16f4872fc95df6ec4..501034c1cc5da471a466fdb9b53a4bf3aa9b40e3 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse from typing import List, Tuple diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index 
0c454ea50f6658148a499f508c626601d3a6056d..de0734c1aa83b10270ac39a0cc432d276c88b7ce 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use LoRA with different quantization techniques for offline inference. diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index 8f0eb65e47f6a9cd527a100445fbdae7023ba542..f227e71ba79bedaf9453280bf5a2c8eaab12b0f0 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import gc import time from typing import List @@ -49,7 +51,7 @@ if __name__ == "__main__": # Create an LLM with spec decoding llm = LLM( model="meta-llama/Llama-2-13b-chat-hf", - speculative_model="ibm-fms/llama-13b-accelerator", + speculative_model="ibm-ai-platform/llama-13b-accelerator", ) print("With speculation") diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 043220d979c3c89b9729df41c3ff14aafd23a8ea..630fd1bf834201c2a8bf2f6c9b6ef02fc0055d0c 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use the multi-LoRA functionality for offline inference. diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index f098c8e5fed1ef844aeb1d1e6f54008f248fbb5d..517d1bfce95d87fca083e183b270708a488fd9cb 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams # Sample prompts. 
diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py index 8ec17e34009530c59f4a2725707610b875e02c61..c899a01a0bb935a8d1f4fa99b811046d208348b9 100644 --- a/examples/offline_inference/neuron_int8_quantization.py +++ b/examples/offline_inference/neuron_int8_quantization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/pixtral.py b/examples/offline_inference/pixtral.py index c12ff7021cf51245a5d1250f278307167fd6f501..760de114508cd45714b3e8393770dddcdb4b0f3f 100644 --- a/examples/offline_inference/pixtral.py +++ b/examples/offline_inference/pixtral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa import argparse diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py index 67b755a1559662e047f9e991bd4377de043167ad..4c326c417b4db4c3d04d0cbc074d47a61e5a74d2 100644 --- a/examples/offline_inference/prefix_caching.py +++ b/examples/offline_inference/prefix_caching.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index 8a94b5c2a8623868bbfa58d4a2bba2c9a3bede16..c2e072fdd8889a2cb377ad099233a33ac4dfcbde 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import inspect import json import os diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index d7423e6c6da93ea7dda536a902838e7d51230f11..b1fe829b3c380800dc2f79aa42f87a1ab257c88f 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + import argparse import dataclasses import os diff --git a/examples/offline_inference/ray_placement.py b/examples/offline_inference/ray_placement.py new file mode 100644 index 0000000000000000000000000000000000000000..cd801a3c0c858bfc8cfbc3cf20b6d417b202bd1f --- /dev/null +++ b/examples/offline_inference/ray_placement.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +a simple demonstration to show how to control +the placement of the vLLM workers with Ray. +The key is to set VLLM_RAY_PER_WORKER_GPUS and +VLLM_RAY_BUNDLE_INDICES properly. +""" +import os + +import ray +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +from vllm import LLM +from vllm.worker.worker import Worker + + +class MyWorker(Worker): + + def report_device_id(self) -> str: + from vllm.platforms import current_platform + return current_platform.get_device_uuid(self.device.index) + + +class MyLLM(LLM): + + def __init__(self, *args, bundle_indices: list, **kwargs): + # a hack to make the script work. + # stop ray from manipulating CUDA_VISIBLE_DEVICES + # at the top-level + del os.environ["CUDA_VISIBLE_DEVICES"] + # every worker will use 0.4 GPU, so that we can schedule + # 2 instances on the same GPUs. + os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4" + os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join( + map(str, bundle_indices)) + print(f"creating LLM with bundle_indices={bundle_indices}") + super().__init__(*args, **kwargs) + + +class RayTrainingActor: + + def report_device_id(self) -> str: + # the argument for get_device_uuid is the index + # of the GPU in the visible devices. 
+ # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs + from vllm.platforms import current_platform + return current_platform.get_device_uuid(0) + + +# ray manages 4 GPUs +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" +ray.init() + +# we want to co-locate vLLM instance and the training actor +# on the same set of GPUs. +# the placement plan is as follows: +# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2) +# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2) + +pg = placement_group([{"GPU": 1, "CPU": 0}] * 4) +ray.get(pg.ready()) +print(f"placement group has bundles {pg.bundle_specs=}") + +training_actors = [] +training_actor_device_ids = [] +inference_engines = [] +inference_engine_device_ids = [] + +for bundle_index in [0, 1, 2, 3]: + training_actor = ray.remote( + num_cpus=0, + num_gpus=0.4, + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_index, + ), + )(RayTrainingActor).remote() + training_actors.append(training_actor) + device_id = ray.get(training_actor.report_device_id.remote()) + print(f"training actor {bundle_index} is on {device_id}") + training_actor_device_ids.append(device_id) + +for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]): + # IMPORTANT: when creating vLLM instances, we need to + # make sure there are no GPU activities on the target GPUs, + # otherwise, they will interfere with the vLLM memory profiling, + # and cause unexpected behaviors. 
+ llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_capture_child_tasks=True, + ), + )(MyLLM).remote( + model="facebook/opt-125m", + enforce_eager=True, + worker_cls=MyWorker, + tensor_parallel_size=2, + distributed_executor_backend="ray", + gpu_memory_utilization=0.4, + bundle_indices=bundle_indices, + ) + inference_engines.append(llm) + # don't call any method on the inference engine here, + # otherwise it will block until the vLLM instance is created. + +for i, llm in enumerate(inference_engines): + inference_engine_device_ids.append( + ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))) + print(f"inference engine {i} is on {inference_engine_device_ids[-1]}") + +# check the placement +# the first two training actors should be +# on the same GPUs as the first inference engine +assert training_actor_device_ids[:2] == inference_engine_device_ids[0] +# the last two training actors should be +# on the same GPUs as the second inference engine +assert training_actor_device_ids[2:] == inference_engine_device_ids[1] diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index 5c4918008dcb3e0aba75dddc72fe7f35ff84fa80..5000251c099f7ed5e9fca715907e45a46befa5e3 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ a simple demonstration of RLHF with vLLM, inspired by the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF . 
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 4207f8922403b351db9d5f0153045e9d9b29209b..863276432cb9c307b86ce732709b5e8393031f99 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Saves each worker's model state dict directly to a checkpoint, which enables a fast load path for large tensor-parallel models where each worker only needs to diff --git a/examples/offline_inference/scoring.py b/examples/offline_inference/scoring.py index 5da9e710959b55da34697bd2c04e069e3fd67a7c..7daa82b8277271279c59e55a91cfc5ba399cebd9 100644 --- a/examples/offline_inference/scoring.py +++ b/examples/offline_inference/scoring.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM # Sample prompts. diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index abcfa8e8f2f2a2bb82ac218d937eb355443f1d75..b45954b3bd54a982447bdb20c5df1b4cb63737a6 100644 --- a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 00d864606eeff0fef9ed6b02f83870a4e9d99917..38ffd7fb9903de6653383272b556bfc7cc746280 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from enum import Enum from pydantic import BaseModel diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index b6de73eb7266e9a4361e84ce3064531d7bc8da89..35df6011550f2db2e5deac6a9c737000a79eeab2 100644 --- a/examples/offline_inference/torchrun_example.py +++ 
b/examples/offline_inference/torchrun_example.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ experimental support for tensor-parallel inference with torchrun, see https://github.com/vllm-project/vllm/issues/11400 for diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index 251629b8027ce30c2d51759b350342f5ea8aeee0..bd0e984627d1125b59f20475bdd9124a16288403 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams prompts = [ diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index f0a2d7515fa13e500f3fa0f63116958fe5522c32..436c36570599a13880afa140a37630ea2f87faf8 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models for text generation. 
@@ -530,18 +531,33 @@ def run_qwen2_vl(question: str, modality: str): return llm, prompt, stop_token_ids -# GLM-4v -def run_glm4v(question: str, modality: str): - assert modality == "image" - model_name = "THUDM/glm-4v-9b" +# Qwen2.5-VL +def run_qwen2_5_vl(question: str, modality: str): - llm = LLM(model=model_name, - max_model_len=2048, - max_num_seqs=2, - trust_remote_code=True, - enforce_eager=True) - prompt = question - stop_token_ids = [151329, 151336, 151338] + model_name = "Qwen/Qwen2.5-VL-3B-Instruct" + + llm = LLM( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None return llm, prompt, stop_token_ids @@ -571,6 +587,7 @@ model_example_map = { "pixtral_hf": run_pixtral_hf, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, + "qwen2_5_vl": run_qwen2_5_vl, } diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index 4ce3d496bf45b2fea18c87a60729602c7243a6a1..3075fbbfa0f363f2f96013de4c09fdcc523475ac 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models for multimodal embedding. 
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 43c44fa867e0aa0e4bbccde1f819445709192258..8d2172a606f8dac32fbc7f95250f1d30345da988 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This example shows how to use vLLM for running offline inference with multi-image input on vision language models for text generation, @@ -391,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ) +def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: + try: + from qwen_vl_utils import process_vision_info + except ModuleNotFoundError: + print('WARNING: `qwen-vl-utils` not installed, input images will not ' + 'be automatically resized. You can enable this functionality by ' + '`pip install qwen-vl-utils`.') + process_vision_info = None + + model_name = "Qwen/Qwen2.5-VL-3B-Instruct" + + llm = LLM( + model=model_name, + max_model_len=32768 if process_vision_info is None else 4096, + max_num_seqs=5, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": "system", + "content": "You are a helpful assistant." 
+ }, { + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + stop_token_ids = None + + if process_vision_info is None: + image_data = [fetch_image(url) for url in image_urls] + else: + image_data, _ = process_vision_info(messages, + return_video_sample_fps=False) + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=image_data, + chat_template=None, + ) + + model_example_map = { "aria": load_aria, "deepseek_vl_v2": load_deepseek_vl2, @@ -403,6 +461,7 @@ model_example_map = { "pixtral_hf": load_pixtral_hf, "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, + "qwen2_5_vl": load_qwen2_5_vl, } diff --git a/examples/offline_inference/whisper.py b/examples/offline_inference/whisper.py index 087ad4376fb2e940068b3fee042d0a89ec0e8620..59c119a772dabe7186f184915a63b9e1b74443c0 100644 --- a/examples/offline_inference/whisper.py +++ b/examples/offline_inference/whisper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from vllm import LLM, SamplingParams diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index 49a085febdc57b9a69cbba79aa7be5302bcd16c6..623e0d59a30e34f75d0837b1e6358673ccc62389 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Example Python client for `vllm.entrypoints.api_server` NOTE: The API server is used only for demonstration and simple performance benchmarks. It is not intended for production use. 
diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py index a07affe3351ce1fbff8b1f384b692dee50712bfd..fc434ada1d15625ddb8706f37d78ed188b0e08ff 100644 --- a/examples/online_serving/cohere_rerank_client.py +++ b/examples/online_serving/cohere_rerank_client.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Example of using the OpenAI entrypoint's rerank API which is compatible with the Cohere SDK: https://github.com/cohere-ai/cohere-python diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index 8ceb8f68ea0ce9788c5a96a24b2f608cea2759c0..ee01e1eae6281e6064a5ef19d380080e7cc20d23 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import gradio as gr diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index 54e907582986f7b9c451646489d08ef30b72cd63..c619146b03aed7eac1b70a8459592af1047bc86c 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/online_serving/gradio_webserver.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import json diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py index bf4de76ddf3627fbb7a52898a74d1de87efcb033..3e760e1717883d250a63d669307bb33748778c79 100644 --- a/examples/online_serving/jinaai_rerank_client.py +++ b/examples/online_serving/jinaai_rerank_client.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Example of using the OpenAI entrypoint's rerank API which is compatible with Jina and Cohere https://jina.ai/reranker diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index 
bbada3891bd199d7b1ff847f9a346518a5a74dea..a81562041130962c2d103a547c6c395d9ce054fb 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 03cc037bb6779a560e2ca22debb9072fd3bfae98..d5f798a8dae62efc6185b76067e244ced6901401 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """An example showing how to use vLLM to serve multimodal models and run online serving with OpenAI client. diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index 2bbe42b6bd2ef0302ed4e1b7a817aa6ab3fa3050..416fb61ca8bb58329c72862553c685e9f41929f7 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Set up this example by starting a vLLM OpenAI-compatible server with tool call options enabled. 
For example: diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 8c059c7ca07ce4c89f608fe496f20df189d1bdab..cddd9318000b25c981bbc356b8d4cc8c9f572830 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from enum import Enum from openai import OpenAI diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index 83e51a48bcc6b3de76bb7671bd3214a6ce5a59bc..a88c8adb55c28d538c87d73bfd2f546b4b6a7e27 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ An example shows how to generate chat completions from reasoning models like DeepSeekR1. diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index 8c14aac6b4ecbb803090c9d9e96bb15c1c2aa832..489bfcd5ec2a2c2e426fd6d66cb818792557a8e3 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ An example shows how to generate chat completions from reasoning models like DeepSeekR1. 
diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index a56e7429b7567e2526776c9e45dfa582a20ae256..f49d7a228191c7f98ab70727a95f46496a4994ed 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import base64 import io diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 58519f978d340a1af85581f478d2729999e5d26f..06b93d7d193154c9c1217aaa8cc17f334d14787e 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 365a684d53f2b332d1281a7c3419bb363d7a98f9..67c5fc91bc65bb1ce80875347e57ddddb7302011 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Example online usage of Score API. diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index 4bd7ca01d750df2431f558b7e7ad1a7d3f37edf6..cb110997464ac41782612455b70a2d16ef021eb5 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from openai import OpenAI # Modify OpenAI's API key and API base to use vLLM's API server. 
diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index 37ec8f2fb6be377272d74c673a9f52ed6fcb5543..e17f9c5efd65907ba0cf3c070daa0875d54f3059 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Example online usage of Pooling API. diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py index b1a2b3c3c4aaf521e89f16f9b3500414ab7f15c5..7a605f85b97fe0ea00aeb494ecd4538b3af1da87 100644 --- a/examples/online_serving/opentelemetry/dummy_client.py +++ b/examples/online_serving/opentelemetry/dummy_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import requests from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( OTLPSpanExporter) diff --git a/examples/other/tensorize_vllm_model.py b/examples/other/tensorize_vllm_model.py index 5fff1fdf502c93bdb50d0568142de831b1c3b304..68345e6cb98d98a8ad94202cfbd161be280cd510 100644 --- a/examples/other/tensorize_vllm_model.py +++ b/examples/other/tensorize_vllm_model.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import dataclasses import json diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja deleted file mode 100644 index e94661cb390710ee48217c0326d71c65c83b405e..0000000000000000000000000000000000000000 --- a/examples/template_pixtral_hf.jinja +++ /dev/null @@ -1,38 +0,0 @@ -{%- if messages[0]["role"] == "system" %} - {%- set system_message = messages[0]["content"] %} - {%- set loop_messages = messages[1:] %} -{%- else %} - {%- set loop_messages = messages %} -{%- endif %} - -{{- bos_token }} -{%- for message in loop_messages %} - {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} - {{- raise_exception('After the optional system message, conversation roles must alternate 
user/assistant/user/assistant/...') }} - {%- endif %} - {%- if message["role"] == "user" %} - {%- if loop.last and system_message is defined %} - {{- "[INST]" + system_message + "\n" }} - {%- else %} - {{- "[INST]" }} - {%- endif %} - {%- if message["content"] is not string %} - {%- for chunk in message["content"] %} - {%- if chunk["type"] == "text" %} - {{- chunk["text"] }} - {%- elif chunk["type"] == "image" %} - {{- "[IMG]" }} - {%- else %} - {{- raise_exception("Unrecognized content type!") }} - {%- endif %} - {%- endfor %} - {%- else %} - {{- message["content"] }} - {%- endif %} - {{- "[/INST]" }} - {%- elif message["role"] == "assistant" %} - {{- message["content"] + eos_token}} - {%- else %} - {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} - {%- endif %} -{%- endfor %} diff --git a/find_cuda_init.py b/find_cuda_init.py index 51db23102f9aceea8db6bb1e65b2222544ba379a..0d13b2f862102718acf362f9df729c61bb277220 100644 --- a/find_cuda_init.py +++ b/find_cuda_init.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import importlib import traceback from typing import Callable diff --git a/python_only_dev.py b/python_only_dev.py index 7d95ac96e6e4b3d6f9398fe9255b083f7797a056..a303697b780a6c6c5fb5e15779fa0652ff1b78dd 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation). 
TL;DR: diff --git a/requirements-common.txt b/requirements-common.txt index e5248572ce4d40e88048a7648c2f186dead0e585..cfa02025629f253752ca264e8f086a18c36597ec 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -5,7 +5,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.48.2 # Required for Bamba. +transformers >= 4.48.2 # Required for Bamba model and Transformers backend. tokenizers >= 0.19.1 # Required for Llama 3. protobuf # Required by LlamaTokenizer. fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' @@ -34,6 +34,6 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.9.0 # required for compressed-tensors +compressed-tensors == 0.9.1 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py diff --git a/setup.py b/setup.py index 6a833d7c78353a5f8d39765697938f5800f6f803..c0ac0d3a096a3a172680d637fa5c1f74cd8a0aee 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ctypes import importlib.util import logging @@ -503,9 +505,9 @@ def get_version_add(sha: Optional[str] = None) -> str: new_version_content = f""" try: - __version__ = "0.7.1" - __version_tuple__ = (0, 7, 1) - __hcu_version__ = f'0.7.1+{version}' + __version__ = "0.7.2" + __version_tuple__ = (0, 7, 2) + __hcu_version__ = f'0.7.2+{version}' from vllm.version import __version__, __version_tuple__, __hcu_version__ except Exception as e: @@ -627,7 +629,7 @@ def get_requirements() -> List[str]: return resolved_requirements if _no_device(): - requirements = 
_read_requirements("requirements-cpu.txt") + requirements = _read_requirements("requirements-common.txt") elif _is_cuda(): requirements = _read_requirements("requirements-cuda.txt") cuda_major, cuda_minor = torch.version.cuda.split(".") diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index a3c9d5c6e08984f328f4f29cfb3be0166d52943c..d9ac611644df89037e9a13be79ec80938ef86303 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """vllm.entrypoints.api_server with some extra logging for testing.""" from typing import Any, Dict, Iterable diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 864e75229cc983698fa56082d8f907f15be05eeb..5e0bf67ba8713a0976b72fb98c99e2fe37fcf984 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import subprocess import sys import time diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index db14ca6e6020bcd06387508f430ef6d1935bcefc..7ed4f7826ccb497433bd359c082d32139bc32754 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import uuid diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 5668cc30d32c35f9f44f2edd759e579959724371..fd6d89d4e00de91a21e67ed24dae207106e5c189 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.engine.async_llm_engine import RequestTracker diff --git a/tests/basic_correctness/test_basic_correctness.py 
b/tests/basic_correctness/test_basic_correctness.py index ea971c257d05f446475edd3755978facc0beabc3..c0243da34738f0f234edcca4e9b43113f2ea71bf 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -1,18 +1,15 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the short outputs of HF and vLLM when using greedy sampling. Run `pytest tests/basic_correctness/test_basic_correctness.py`. """ import os -import pickle -import re import weakref -from unittest.mock import patch import pytest from vllm import LLM from vllm.platforms import current_platform -from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from ..conftest import VllmRunner from ..models.utils import check_outputs_equal @@ -147,63 +144,9 @@ def test_models( # with hf_runner(model, dtype=dtype) as hf_model: # hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) -# check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.skip_v1 -def test_model_with_failure(vllm_runner) -> None: - try: - with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", - side_effect=ValueError()): - with pytest.raises(ValueError) as exc_info: - vllm_runner(os.path.join(models_path_prefix, "facebook/opt-125m"), - dtype="half", - enforce_eager=False, - gpu_memory_utilization=0.7) - matches = re.search(r"input dumped to (.+).pkl", - str(exc_info.value)) - assert matches is not None - filename = f"{matches.group(1)}.pkl" - - with open(filename, "rb") as filep: - inputs = pickle.load(filep) - - if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")): - raise AssertionError("Missing keys in dumped inputs. 
Dumped keys: " - f"{list(inputs.keys())}") - assert isinstance(inputs["arg_1"], - ModelInputForGPUWithSamplingMetadata) - finally: - os.remove(filename) - - -@pytest.mark.skip_v1 -def test_failure_with_async_out_proc(vllm_runner) -> None: - - filename = None - try: - with vllm_runner("facebook/opt-125m", - dtype="half", - enforce_eager=False, - gpu_memory_utilization=0.7) as vllm_model,\ - patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", - side_effect=ValueError()): - model_config = vllm_model.model.llm_engine.model_config - assert model_config.use_async_output_proc - with pytest.raises(ValueError) as exc_info: - vllm_model.generate_greedy('how to make pizza?', 250) - matches = re.search(r"input dumped to (.+).pkl", - str(exc_info.value)) - assert matches is not None - - filename = f"{matches.group(1)}.pkl" - finally: - # Clean up - if filename is not None: - os.remove(filename) - pass + # check_outputs_equal( + # outputs_0_lst=hf_outputs, + # outputs_1_lst=vllm_outputs, + # name_0="hf", + # name_1="vllm", + # ) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index fda8a2bc8112fffbf87b2c3bdb49b11e69b42b8d..28e1ba350eedba3143fc8cd9d8a3a0c2a6c81d31 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM when using greedy sampling. It tests chunked prefill. 
Chunked prefill can be enabled by diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index 04d61cc7637fbe146666d52684665f1747192a2e..1e1093487d2c4103a9723fdb8cd2d770b8b47166 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from ..utils import compare_two_settings, models_path_prefix diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 53f4ef08f36a2e82c4bd139237cfd492f14d41ea..da9239b0940764cb948bdd5ed25eb5c52086356a 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm import LLM, SamplingParams diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 8be9dccb4d6c1e1dc7dd86a4214a8e95ed32ddc7..3ad3877efc35df9209705eb40028ea50c4b455c5 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the short outputs of HF and vLLM when using greedy sampling. VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. 
diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 8fa10e5bd1b3755febf92d1c66898d426ecb2a4a..74bc58a2dd542b9c53554145cc869098f85b73a1 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from copy import deepcopy from typing import Callable, Union diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index aa11524812cdd3940e08ba3eb8cccc15fd90ea87..9d633ad259b13520d4f958c17c91109a40749be0 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Test the piecewise compilation with a simple model so that we can exactly calculate the expected output and side effects. diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index d4ede4d2320a7c7e3d8f5450a1891534170134c7..0404722bab8917863438792fd21762df7b3b9ad1 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Test the piecewise compilation with a simple model, comparing the output with and without the piecewise compilation. 
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 8622c4bf03442c1483edeb2282ec4b76ebde112f..268163d11f82ba3167a29cdeda91dd95298b9f24 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Dict, List, Optional diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 4dfdfe21a67dff15bb5b53a9c69ce033257a8f24..6e83fa36881e4a7b3997c8b3b08d665c2ecbe911 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.config import CompilationLevel diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 8ad436296b2b7a04cbd78bf6df0b21b464e7421d..adcf2ef1fcfd1e5473673d1cbb9ad086e68c6ddb 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest import torch diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index b4266a4a7db943ca8bf4ddf9ae8397ea7d85994f..c14f0caab5399b9aa0abdfe71bbcf72d7dd647b3 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch from compressed_tensors.quantization import FP8_DTYPE diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 03e7535093c5d2ba67cb24ebbb23e92c5a0539b1..70920ab10ec2f645764e69fbc6ed2e703fa77906 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle import pytest diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 
74f66baaa5ea1536c2d5196d351dd65c49cc1cc3..0934c61135792f8f6b0bff187a69ba6916965937 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import torch diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 48dfe72756a3434c33ddcd4a7cbe073cfb127f31..ed9a8652964fba458b53cc1e75bceed16c82185f 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import torch diff --git a/tests/conftest.py b/tests/conftest.py index 19161a74f72e69c3b43d2c727dfedd516cbc802f..15987bc03d0ec8e18d6853e1c561e3f0861c8466 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os import tempfile @@ -738,6 +740,7 @@ class VllmRunner: images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: inputs = self.get_inputs(prompts, images=images, @@ -745,7 +748,8 @@ class VllmRunner: audios=audios) req_outputs = self.model.generate(inputs, - sampling_params=sampling_params) + sampling_params=sampling_params, + **kwargs) outputs: List[Tuple[List[List[int]], List[str]]] = [] for req_output in req_outputs: @@ -783,6 +787,7 @@ class VllmRunner: images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, + **kwargs: Any, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: inputs = self.get_inputs(prompts, @@ -791,7 +796,8 @@ class VllmRunner: audios=audios) req_outputs = self.model.generate(inputs, - sampling_params=sampling_params) + sampling_params=sampling_params, + **kwargs) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) @@ -827,13 +833,15 @@ class VllmRunner: images: 
Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + **kwargs: Any, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, images=images, videos=videos, - audios=audios) + audios=audios, + **kwargs) return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] @@ -848,6 +856,7 @@ class VllmRunner: videos: Optional[PromptVideoInput] = None, stop_token_ids: Optional[List[int]] = None, stop: Optional[List[str]] = None, + **kwargs: Any, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -862,7 +871,8 @@ class VllmRunner: greedy_logprobs_params, images=images, audios=audios, - videos=videos) + videos=videos, + **kwargs) def generate_encoder_decoder_greedy_logprobs( self, diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py index 0464d6a74da6107b43cf2eaaef83265d3476018e..b7a9863f4aaf58945e90d7fe70dde805f1666abf 100644 --- a/tests/core/block/conftest.py +++ b/tests/core/block/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index 70577ec052a2cd8ca046dff79382ff7a104228e1..7d3ccaadaca19c13deef42ca7bcaa86ecbd48305 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, Iterable, Optional import pytest diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 5dd202aca37748b02c5fd14178881aac88935d8c..d44bc5617bc7e69a95e6116f8b63cdfacc9e929c 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import cycle import pytest 
diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 4eadeb3c7ebca0ffa4a39ce2910796bbd3f771ca..4ec96f87f6aeac2cd43b3465a9fa88799cd9c5e0 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py index cfd749ad586944911f27b191db60cfb10bb158e1..68d9618ae245be712afeb996deaf1385c05d7c6e 100644 --- a/tests/core/block/test_block_manager.py +++ b/tests/core/block/test_block_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index e2391a5680b3637f4541ad05e147031c9d73faef..d8cf0bec709ac1c32c9df2d793a3ae930429f4cd 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py index cfdd3582ed2efdf3ed3ad2e3665d0675504e2463..20260873003df3ae939d8f4c7e85798f89cb0b27 100644 --- a/tests/core/block/test_common.py +++ b/tests/core/block/test_common.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random import pytest diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index a9e38d40444a9097712d8046472cce07e7890d06..a1414edd95622c2acca84cc400e2d64ca17157a0 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.core.block.cpu_gpu_block_allocator 
import CpuGpuBlockAllocator diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index 10d5964dcfe8a4608dc7e3dc3a03dff5a718cae1..0ca2a0b8054d87fa97f544a9fa55f65a4158a87c 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional import pytest diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 6642174c17d8b20ad8441510dbef5871e09a6bcd..bf40b334abc56a9b06f51f650c00d8bdc3f79952 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import random from typing import List, Optional @@ -63,8 +65,8 @@ class TestPrefixCachingBlock: previous_block = MagicMock(spec=PrefixCachingBlock) prev_block_hash = random.randint(0, 1000) - previous_block.content_hash = (prev_block_hash - if prev_block_has_hash else None) + previous_block.content_hash = (prev_block_hash if prev_block_has_hash + else hash('None')) num_to_fill = block_size if is_curr_block_full else random.randint( 0, block_size - 1) diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index eaaf004df38b287cea720433c663e18535768ce5..8da25aea457dc5a7240e2a141ef3d896741df06c 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List from unittest.mock import MagicMock diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 49982084e4e768b37026c7057d8fa9a618c66696..ca757b07472da6a5b12566a74ebb25c693ee0039 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 8f6de84e566e70705cd86cc3378ffcaca4e5c701..dcc97ebaa7c56c024fcdb92d323cf816354e2540 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from collections import deque from typing import List, Set, Tuple diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 16bea54936bc879c816d730a3acfbb0d5a2b4f29..a4e3c73a5a7bb3c3d92343654b36c6ebdf4ec9dd 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest # noqa diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index d604e5250a3f95c4b353b9cc90b49fb68379da37..64b3e148ee728c08da3479e850cdf20e79a8fe08 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import msgspec from vllm.executor.msgspec_utils import decode_hook, encode_hook diff --git a/tests/core/utils.py b/tests/core/utils.py index 16703cd19fa1e80d9b4c9bfa79b7f8436674fa90..fb77dccce1c9daf2d5c76903b26e1331782d8b4d 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from collections import defaultdict from typing import Any, Dict, List, Optional diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py index fc4043cd3014e749703f9f8c6b1387f23dec888a..72e7ebdb7b59478b8d23f9cae6b01d436e8ca819 100644 --- a/tests/distributed/test_ca_buffer_sharing.py +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # can only run on machines with p2p access across GPUs # 
can only run with torchrun: # torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index d01f187521fe61f50a0b49ce32bb5b63c8b39f0a..bc916e8de07c4c3c31d5ea397b920f52a8f5db4c 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test the communication operators. Run `pytest tests/distributed/test_comm_ops.py`. diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 4072616fd30e281fe9e9791430f1fec93fa9d846..46887bca42a90f7d421433ec6af9608cfe97fd01 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import random diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py index 62e77a2f77597585b677c4b1d55e45d4bcd8372c..4b0c65d1d3a47fa66c65bd3604a9dd8ca510a0e8 100644 --- a/tests/distributed/test_distributed_oot.py +++ b/tests/distributed/test_distributed_oot.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from ..entrypoints.openai.test_oot_registration import ( run_and_test_dummy_opt_api_server) diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py index 9f9c0ff07ee37c540f00ae7f417481d3ae0f881f..c86d2d8a0061a415717a2eeae358f86ac93b4bf8 100644 --- a/tests/distributed/test_multi_node_assignment.py +++ b/tests/distributed/test_multi_node_assignment.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Make sure ray assigns GPU workers to the correct node. 
Run: diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 6a4fa020631472ee57da9fa5b76826a199d2a395..fcf4d30ccd28cb5982b419f757f7b57bad5f51d0 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ WARNING: This test runs in both single-node (4 GPUs) and multi-node (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 2d4d07dd27522070ad18831cb2851ba814b8f7d5..3ed104820b4765fe03c9af73d840d1ada79de7dc 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 1cd0550746f7d2c1b323516de6a50bcf12ffc556..f534a39c9b0c4f6a2d087efb469a506159ae2869 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index a8571a11578927383d1a0819628c13370ac12138..4c42a0ed811253fb004ae163fb2de79d08817a7c 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import multiprocessing import os from typing import Dict, List diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py index 62311a626bc470e5402531bd2447a52e8f18d046..9b1bbd6e545c1b505a1a2db6ff0394afb345e591 100644 --- a/tests/distributed/test_same_node.py +++ b/tests/distributed/test_same_node.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import torch.distributed as dist diff --git 
a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index 723872682cf9780ca0f3bb6830cc5ed813d60fdf..59fa7cc9f319b1eb2fea4b3ed4ec5d5178ae7599 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import multiprocessing import random import time diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index 7aa03d7f0402a848625a4f7f3531dfb7565b1e46..a092a548a59c6e2a53ceaaa66c0c6c7f775850d2 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # unit test for `examples/offline_inference/torchrun_example.py` import random diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 5fb1ae7b29fd2af59e3d99d0047523d6dc3f0fab..4432950f274e023cb5173bc4ca016785f0850c4d 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import socket import pytest diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index a90763619ec26eb9ad8d3777639d209fb94d2879..451a1c860ec8b84ff453147b64ae7abb177a7c82 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """E2E tests to verify the correctness of the encoder-decoder framework Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. 
diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py index 88f3fad4c79f8787e0034e8f061604cb29dd5d96..3ba3c4ec53a5e54275d20b077c55484d5002c93c 100644 --- a/tests/engine/output_processor/test_multi_step.py +++ b/tests/engine/output_processor/test_multi_step.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from unittest.mock import MagicMock diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index cc14e8cbf75dfb8f6c7fe7b13eda102fd2d97185..e9ad8d1612102b85ad7160a524363493929d3063 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock import pytest diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 4e269de9fc40b5b225b4dcb1ba1c2781648f8899..8698d124e73ffc063ac450895511fd61b369c4b8 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from argparse import ArgumentTypeError import pytest diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index c045fb12c98812185916a4d4a66282e492882ce2..ce19f5d45d6507cf1ad2f2cbe380a983d96dc674 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.engine.arg_utils import EngineArgs diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index 9380b834882ad1e03f2ba1264e6bb2663a742b5d..d6526ff4f5ac26f090e7fe95a59e45feda93a9a3 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from typing 
import Any, Callable, Dict, List, Optional, Tuple, Union diff --git a/tests/engine/test_detokenization.py b/tests/engine/test_detokenization.py index 57d699f51b4aceece8f602f91dbbaafc9544583d..f19e560b1f1a180a786d3f3b23e9ebc46092e44c 100644 --- a/tests/engine/test_detokenization.py +++ b/tests/engine/test_detokenization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.entrypoints.llm import LLM diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index 04505fcaae24b55d5ab1ae059ec1537b4bae09a5..f1fe58e35a32eaf5686db0ea0a32e141389ff0ba 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from concurrent.futures import ThreadPoolExecutor from functools import partial diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index 80b21a2bff4da70d9667b0a47e837da56b04a355..deb5ae59b5917e75b2b2928ecdde71605639f350 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from ..conftest import IMAGE_ASSETS diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 3f023dbe42d58b47b339abf10ebc7cd2c8e4f7b0..e304e2792daac71b36e3f3d82457e6c1191b5746 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.entrypoints.llm import LLM diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index c2a55bd1d059444e4e4b498970c88bbf222c7cb9..722634a60eb24410352afff345e87b700e0486d5 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test the different finish_reason="stop" 
situations during generation: 1. One of the provided stop strings 2. One of the provided stop tokens diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 27414813b635465f534aea48788038ecc304bb8f..d6c45bd5a9561b6f1553f87ed7ec9112f8f7d979 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, List, Optional import pytest diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index ef74062ce4b41e2878896e9eb0deb075d8ee129c..b00e168db9d325bcd8f2998f48d736fc13e32d59 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 5ec12b98f09788a52aeba832c5f11dc86052f85e..f6623a9b94ae9096cfe66d9e3457db9d0fc1ab32 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file test accuracy of the vLLM server via LMEval. 
It uses local-completions, which interacts with vLLM diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index ab99cccddda42fea8f6af60c596d7b95e526b3d1..6b522ce1da6d6fe14c09722c6cdb81636f7510ed 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import os diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 22473ce275295540c8c61df5641f40a6e4a38544..39d4810de9e7b7b598ba10244a5172afeac2dd74 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm import LLM diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 4209f1dca6a2002200a482399216046c06ae8ceb..2deb5fc390b8681b0cf0a80c81a4c9e2576a01c1 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref from typing import List diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 5283785303ce35c8874d33eb73740de8232ce560..74297101a086f6ad7712666b4bd241b15d91ec4b 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref from typing import List import os diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 33f975be6d28aff23ada8cd74592ec8ce4df60ed..24a5dad99b6b0a459596b8907c58c8823a237832 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref import pytest diff --git 
a/tests/entrypoints/llm/test_gpu_utilization.py b/tests/entrypoints/llm/test_gpu_utilization.py index 3cfa17f0b9701fdfa47b0224feed815738b6a748..3bd9aa58f842ffc193f6710d489cb755773956f5 100644 --- a/tests/entrypoints/llm/test_gpu_utilization.py +++ b/tests/entrypoints/llm/test_gpu_utilization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from vllm import LLM, SamplingParams from ...utils import models_path_prefix diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index a710e9682355bdd017163225a91b8595d0b32151..9465023aca1d3c33f135596497568ff76962f36f 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re import weakref diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py index 6a22b6a0a87eb2040deb7bf128d0cddb105e7864..72e84b953660c48a1a4cee846975d86e866d3845 100644 --- a/tests/entrypoints/llm/test_init.py +++ b/tests/entrypoints/llm/test_init.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index e57ee204837c5f0bf36c1654f6d5af52ffbef28f..6ed693610f0cbb247d4fc838dbcbaf7438809e21 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import sys import os from contextlib import nullcontext diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index aaba8a81dea41c0531eebc556c5623ca28ec31e1..0d972d2a4311fccfaae109e896726b3d4b2af374 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os diff 
--git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 18a761d329128505cb6195de05a55fa9b5a6e20b..815fe12c812dd14ba2b7a65dbd67999e7b3d991c 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for HF_HUB_OFFLINE mode""" import importlib import sys diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py index 4607e4dfe4d0bfdf6744d49926b53aff843e1343..f7b81be48bd11ac5d110ab6b15295494d4c5069c 100644 --- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py +++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py index ac73ad50a7395594f5d7c0dada60cfa9edf55e00..2157e059594b462353d6274cdcd5cccf2c87388d 100644 --- a/tests/entrypoints/openai/reasoning_parsers/utils.py +++ b/tests/entrypoints/openai/reasoning_parsers/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/test_accuracy.py index edb8faeb017d305674de2040f383ce6c5ec0c934..89c26e3a471d87d206f63898c5c764cb60940fa8 100644 --- a/tests/entrypoints/openai/test_accuracy.py +++ b/tests/entrypoints/openai/test_accuracy.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file test accuracy of the vLLM server via LMEval. 
It uses local-completions, which interacts with vLLM diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index b9bd0c6161b999a312e1801a9b15a7e0c060ad76..ba98f18c6e372dcb70d094ca52db6ad47ff1ad1f 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import contextlib import random diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index cca06b60a003e3fbda711dc4ee7721bedef6be98..07c5c0d6eb0dcc690a70f29b6b67080993877247 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List import openai diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index 621b70c5361837ce866cadbcef51ebb3c8490829..213551e3866ad73e945f13702fe6796315ad2556 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from http import HTTPStatus from typing import List diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 8761b389233ff9dfb2d5d49e73e7cf798f3ad7d5..0eb2ba669310fbc4de4f814bb92dc92dc0a2ed0e 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # imports for guided decoding tests import json import re diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index 26c750771aa453ac343ea5e3eb2b5000facec144..6a1ca009ed4e8fd25a13de68ea8f8e28d562118f 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + from typing import NamedTuple import os diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index 927b303bf0ce4e922bd9d7a5f948f19daa923d06..73993e3519e4577c077e8b109717a6bac2a21c01 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 8e2f4477e61c86631fd0391bb144eef9bde43c75..392688a3094209ee42e0ec782588ff8cb650adfb 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 01bcd78aa91a8a601443509381093f2110e7b7e5..2f065ec1070e66a38296f9732a526429e29bfc85 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import pytest diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 46fe1bdde3e1fdddb5963e222b64d933a092a1fd..36d9239f36326a6b74f88f4ea2f257ae4c7fc66a 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # imports for guided decoding tests import json import re diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 32b4ce5c4579fd8d5fca219970826aaba2e865a7..1c6b9a6e7d79064d4391ab57d0bdb3ba04fb887b 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -1,3 +1,5 
@@ +# SPDX-License-Identifier: Apache-2.0 + import base64 import numpy as np diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py index 8d4c67ac4722e679098331c899aab34b7f118d48..db4a037f33d8330fb045bb3169767d96b57ec27a 100644 --- a/tests/entrypoints/openai/test_encoder_decoder.py +++ b/tests/entrypoints/openai/test_encoder_decoder.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import openai import pytest import os diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 6ff99f6faa14364114d4e3c1543c7d4adbafcec5..1a62157acc478f978903befdbfb55d5637f4b3ba 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import json import shutil diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 36349b0ba6b2c1e6ba2156c6b148fb15151ba654..8d4bbd0cd1c8d94e169f59b6ac1c038cd57d1543 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import subprocess import sys import tempfile @@ -204,6 +206,7 @@ EXPECTED_METRICS_V1 = [ "vllm:gpu_cache_usage_perc", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", + "vllm:request_success_total", "vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_bucket", "vllm:request_prompt_tokens_count", diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index bcbafb3af9b8d24d0912299890b1585446ad7633..0bc9d736986271fc7edefd7afe88dd420e09dda4 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import openai # use the official client for correctness check import pytest import os diff --git 
a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index fb3bc66c4b6b6537bf37ff23495753076391b8b4..1732f1e1ef6cae0de52e37e261445cfa59b4c8f9 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from ...utils import VLLM_PATH, RemoteOpenAIServer, models_path_prefix, envs diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py index 9c49239398cd2eb07a1bc95797dc5435f9ba2b8e..11d3bfafab1cc6411d0ce4ef984d1d80e9551790 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/openai/test_pooling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 import numpy as np diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 5205dcc9f6440aaa040e60fda46a5067ebaaac3f..693fb8ddbc566235c8bcd50f98e1cf41bac3611b 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # imports for guided decoding tests import re import os diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index cfd8f331339604c7da281cd5a7568800426ff3f2..4c9774a7397defedd455a30e2c7922caffe112e1 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import requests diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 99f6da160d6f9e86f50ffdc14aa02267ec6102f5..9b33eddae2a83736d1f2738860a1436bd1d78dc2 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -1,3 +1,5 @@ 
+# SPDX-License-Identifier: Apache-2.0 + # Separate these tests out from test_completion and test_chat, because they # require launching a second server with a different flag. Running both servers # at the same time on a single node will OOM. diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py index ab9fe4d37e04fa1282981a1dfb9f1d2cb127baf9..2410c87de229b1ac30b1280f07cb77eb96d4bc2b 100644 --- a/tests/entrypoints/openai/test_root_path.py +++ b/tests/entrypoints/openai/test_root_path.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import os from typing import Any, List, NamedTuple diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index f352dcdf82c5e239d5ed984852d168249caf6afb..99a5fedad49fbce85bda473ff3be2c56c8011fd1 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import subprocess import sys diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index 29c62398ff5009905f97b3a1d49be47974e7e999..b196d475872f9069bd9e688c62e857a4b219fd93 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest import requests diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 52d56f0e0c010b21cb4ed371440cf337ae52e4ad..5b66f1a98429c57a3437d4bb091249c2a59a3ee6 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from contextlib import suppress from dataclasses import dataclass diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index 
3bfa4e0de074bace8176f14dc82a1c6b0b2a2ad3..a3dc0b1ee08fffeec3a980ef2518237a2af6eb1d 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from http import HTTPStatus from unittest.mock import MagicMock diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 57f0fdd1a3c9e662f0c8ff07244661b162f077de..58b39196190f54ae74aa15e2017bbdd59c5e0948 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import openai import pytest diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index f81dad36a0b67e7c94619180ad23b8afb3529cf1..1109d3d30023c66c6feaa5c91b9f35cf1be20d8e 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os import pytest_asyncio diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 24710faa6462c4a8f6ff0eaaa893159c5a5f4c89..06b04993b8a33254d764841f709d0390928b6abd 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List import os diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 9e91827b439e2736b77929285e7a0020a08a761c..06214ac570ea3e5e7a2568edcbf361eb6b6ef46d 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List import openai diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 
50f2ab97412cf2489f0d392d8ccec54ea186dbba..23b71d0b3dba932f69a41bbaebe01a22be6c11f6 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict import os diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index 47b0b6bb80ffe6d21f6b1dd19d910b03472acb99..788efa86b1093c5675a73b746e687d265f6ef6e9 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List from unittest.mock import MagicMock diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index f0a2a32c167863e9db62e598d58a83cb45ef4f9b..57ec9865355d9e291ccde38a6ca429e0a2f4947b 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Tuple, Union from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 043185a007d3f7b6f8d78e00f08cd8c9caedb87c..f338b5fee12ec0f70724024417b157d2e66b0aa9 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings from typing import Optional @@ -761,7 +763,6 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_llava.jinja", "string"), - ("template_pixtral_hf.jinja", "openai"), ("template_vlm2vec.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), 
("tool_chat_template_hermes.jinja", "string"), diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py index 175cfe82fb74e5b573c27f708a036810f2188c8d..97ceffab4eb88af55eb5ead21c4efca28a6441f0 100644 --- a/tests/kernels/allclose_default.py +++ b/tests/kernels/allclose_default.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch # Reference default values of atol and rtol are from diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py index 4f2f9cc3dac7db974382d1d679a925a978684482..4f04ec94753297d80e1aaddf0db5054fce8ad139 100644 --- a/tests/kernels/conftest.py +++ b/tests/kernels/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.utils import (create_kv_caches_with_random, diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index f2358940fc7b8c94883e24503ea0fb90f95c5362..34dcf91c766643991e88b84ff2c8cf1460566842 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple, Union import torch diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index dac26efe866b8b9cbcb92462a261b2093bd61f25..2e70b1db35c4538751476e0e517ff449c25f0fa7 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import Type diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 351f79429ab5425c27dcc309e92fbdcb1d1d33fd..f78f6e1e0c1f8a3de4e5338ce24e0f320927fc53 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List, Optional, Tuple diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 
43a02c091e87551d4dfb6490a0a9b0b9c21fd612..c8409caf2854233f74095baf12712ede78099b1f 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import Mock, patch import pytest diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 238d6426bf0997b41a1ba03ebfcacc6e0d7e114b..67595010cb2a513f485ac3f0867e5d18c4c786f7 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test AWQ with fused MoE Marlin kernels. Run `pytest tests/kernels/test_awq_marlin.py`. diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/test_awq_triton.py index ea1e85e89ffb47db04a1c3525e69eaa41038f55c..267f1b11246dd521926275739109238401a79c9a 100644 --- a/tests/kernels/test_awq_triton.py +++ b/tests/kernels/test_awq_triton.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the AWQ Triton kernel. Run `pytest tests/kernels/test_awq_triton.py`. 
diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py index f28fdf3feedbc9969164c3ac03178c4cba21faa7..20eff1c207239f6f203d0eac729832c3614fdc40 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/test_block_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from https://github.com/sgl-project/sglang/pull/2575 import itertools diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 82085eff965a69f0049a50cb533282252ad7238b..e3abc319f6dd491b9ec59d4401e74cff19fba7f6 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List, Optional, Tuple diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f93592a4aa8a635d34057de285660fa1f4397d2f..4c44317d8e59570ded11f55b3a0bd229b46ad84b 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List, Tuple @@ -7,6 +9,7 @@ import torch from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.utils import align_to_256bytes COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -16,6 +19,13 @@ NUM_HEADS = [8] # Arbitrary values for testing HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] +# Parameters for MLA tests. +KV_LORA_RANKS = [512] +QK_ROPE_HEAD_DIMS = [64] +NUM_TOKENS_MLA = [42] +BLOCK_SIZES_MLA = [16] +NUM_BLOCKS_MLA = [8] + # Arbitrary values for testing # don't make it too large. e.g. 
[1024, 36000] will OOM NUM_BLOCKS = [1024, 10000] @@ -433,3 +443,257 @@ def test_fp8_e4m3_conversion( ops.convert_fp8(converted_cache, cache_fp8) torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1) + + +def _create_mla_cache( + num_blocks: int, + block_size: int, + entry_size: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, + align_cache: bool, +) -> torch.Tensor: + cache_dtype = torch.uint8 if kv_cache_dtype == "fp8" else dtype + + if align_cache: + alloc_entry_size = align_to_256bytes(entry_size, cache_dtype) + alloc_shape = (num_blocks, block_size, alloc_entry_size) + cache_full = torch.zeros(alloc_shape, dtype=cache_dtype, device=device) + cache = cache_full[..., :entry_size] + else: + cache = torch.zeros(num_blocks, + block_size, + entry_size, + dtype=cache_dtype, + device=device) + return cache + + +def _fill_mla_cache(cache: torch.Tensor, kv_cache_dtype: str): + rand_dtype = torch.float16 if kv_cache_dtype == "fp8" else cache.dtype + + vals = torch.randn(*cache.shape, device=cache.device, dtype=rand_dtype) + if kv_cache_dtype == "fp8": + temp = torch.zeros_like(cache) + ops.convert_fp8(temp, vals, 1.0, kv_dtype=kv_cache_dtype) + vals = temp + cache.copy_(vals) + + +@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False]) +@torch.inference_mode() +def test_concat_and_cache_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + num_tokens: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + 
align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + total_slots = num_blocks * block_size + slot_mapping_lst = random.sample(range(total_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, + dtype=torch.long, + device=device) + + kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device) + k_pe = torch.randn(num_tokens, + qk_rope_head_dim, + dtype=dtype, + device=device) + entry_size = kv_lora_rank + qk_rope_head_dim + + scale = torch.tensor(0.1, dtype=torch.float32, device=device) + kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device) + + for i in range(num_tokens): + slot = slot_mapping[i].item() + block_idx = slot // block_size + block_offset = slot % block_size + ref_temp[block_idx, block_offset, :kv_lora_rank] = kv_c[i] + ref_temp[block_idx, block_offset, kv_lora_rank:] = k_pe[i] + + if kv_cache_dtype == "fp8": + ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype) + ops.convert_fp8(ref_kv_cache, + ref_temp, + scale.item(), + kv_dtype=kv_cache_dtype) + else: + ref_kv_cache = ref_temp + + opcheck( + torch.ops._C_cache_ops.concat_and_cache_mla, + (kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) + + ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, + kv_cache_dtype, scale) + + if kv_cache_dtype == "fp8": + result_temp = torch.empty_like(kv_cache, dtype=torch.float16) + ops.convert_fp8(result_temp, + kv_cache.contiguous(), + scale.item(), + kv_dtype=kv_cache_dtype) + expected_temp = torch.empty_like(ref_kv_cache, dtype=torch.float16) + ops.convert_fp8(expected_temp, + ref_kv_cache, + scale.item(), + kv_dtype=kv_cache_dtype) + torch.testing.assert_close(result_temp, + expected_temp, + atol=0.001, + rtol=0.1) + else: + torch.testing.assert_close(kv_cache, ref_kv_cache) + + 
+@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("num_layers", NUM_LAYERS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False, True]) +@torch.inference_mode() +def test_copy_blocks_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + block_size: int, + num_blocks: int, + num_layers: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + entry_size = kv_lora_rank + qk_rope_head_dim + + kv_caches = [] + for _ in range(num_layers): + kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype) + kv_caches.append(kv_cache) + + ref_caches = [kv_cache.clone() for kv_cache in kv_caches] + + num_mappings = min(2, num_blocks // 2) + src_blocks = random.sample(range(num_blocks), num_mappings) + remaining = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining, 2 * num_mappings) + block_mapping = [] + for i in range(num_mappings): + src = src_blocks[i] + dst1 = dst_blocks[2 * i] + dst2 = dst_blocks[2 * i + 1] + block_mapping.append((src, dst1)) + block_mapping.append((src, dst2)) + block_mapping_tensor = torch.tensor(block_mapping, + dtype=torch.int64, + device=device).view(-1, 2) + + for src, dst in block_mapping: + for ref_cache in ref_caches: + ref_cache[dst].copy_(ref_cache[src]) + + opcheck( + torch.ops._C_cache_ops.copy_blocks_mla, + (kv_caches, block_mapping_tensor), + 
test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) + ops.copy_blocks_mla(kv_caches, block_mapping_tensor) + + for kv_cache, ref_cache in zip(kv_caches, ref_caches): + torch.testing.assert_close(kv_cache, ref_cache) + + +@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False, True]) +@torch.inference_mode() +def test_swap_blocks_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + entry_size = kv_lora_rank + qk_rope_head_dim + + src_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + dst_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + + _fill_mla_cache(src_cache, kv_cache_dtype) + _fill_mla_cache(dst_cache, kv_cache_dtype) + + src_cache_clone = src_cache.clone() + + num_mappings = min(2, num_blocks // 2) + src_blocks = random.sample(range(num_blocks), num_mappings) + remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining_blocks, num_mappings) + block_mapping = list(zip(src_blocks, dst_blocks)) + block_mapping_tensor = torch.tensor(block_mapping, + dtype=torch.int64, + device="cpu").view(-1, 2) + + opcheck( + torch.ops._C_cache_ops.swap_blocks, + (src_cache, dst_cache, block_mapping_tensor), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + cond=(kv_lora_rank == KV_LORA_RANKS[0] + and 
qk_rope_head_dim == QK_ROPE_HEAD_DIMS[0]), + ) + + ops.swap_blocks(src_cache, dst_cache, block_mapping_tensor) + + for src, dst in block_mapping: + torch.testing.assert_close( + src_cache_clone[src].cpu(), + dst_cache[dst].cpu(), + msg=f"Block {src} from src should have been swapped to block " + f"{dst} in dst_cache.") diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py index 8edfde42ede74270bc4c2b19ca957e682064d094..8cc1a6a1b49f330caf3f5969739ab3e5a545905a 100755 --- a/tests/kernels/test_cascade_flash_attn.py +++ b/tests/kernels/test_cascade_flash_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import pytest diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index f12d41ad909af2a8bb3ea5c58db7108bafbff2f3..b799a958c22883cc98d4a35055305c3f413fb9f0 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for cutlass kernels Run `pytest tests/kernels/test_cutlass.py`. diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py index 56495df34aa6c3dc511fd589c26f51a2cf29ffce..4c613b75fc6f5c10ea62c8542c75c2ab54469b08 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/test_cutlass_2of4_sparse.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for sparse cutlass kernels Run `pytest tests/kernels/test_semi_structured.py`. 
diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 2e726ce740be78f45520f4c32ba9be838430876e..7a06852d6ca9eb3f548ddf3fc4feda0a1596a1ff 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Tests: diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 8121703fe0bc20b33d1f8cb20512f5c98044d914..b8e34637c59e45ab489b27fb0f51b27433e8f568 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import pytest diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/test_fused_quant_layernorm.py index baf8d73fdbffbabb75ce6ef4815e300416db7ec6..d4b674b23534006b6f4c7dccc3760cdf91601c91 100644 --- a/tests/kernels/test_fused_quant_layernorm.py +++ b/tests/kernels/test_fused_quant_layernorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple, Union import pytest diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py index 9c62c0ec369fd42f32b1ac305fb5dbf628e353a1..21e0dddab40ca56de3254fb2c8b60c7063c390ea 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/test_gguf.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from pathlib import Path from typing import List diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index f5242824ef6bfe15ff5b78d5fc8b33d8149b1cc3..f6c72683a6c7b82cf97fc936f588498c47d540a7 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index c56e46b221cc5e19ed58f28d88846b1b5f29f304..303e5b2fa2978677841e80b75494083348a0f74d 100644 --- 
a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/test_machete_mm.py index 1c6eb2dd9a2281f25e10e8b8b4dfbab33b5e00ab..bd60526ed9b765f45116579c3fc489c28285969a 100644 --- a/tests/kernels/test_machete_mm.py +++ b/tests/kernels/test_machete_mm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the machete kernel. Run `pytest tests/kernels/test_machete_mm.py`. diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py index eab874e9e02bb7b94fb0b274e8c122599d496d55..5a18b7916f0f69caee074032db2f580544134537 100644 --- a/tests/kernels/test_mha_attn.py +++ b/tests/kernels/test_mha_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Test: diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 16dc0842b4b05c27824aa8e54ce6d095fb8b199e..f51a5848325f144df2886e327619b8102ce9649c 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the MOE layers. Run `pytest tests/kernels/test_moe.py`. 
diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 5176d47133ed2c01666f9ff0dc1c9544193337d8..23e89547d5308da04b24f87d54e7ff400d3c31c5 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import accumulate, product from typing import Dict, List, Optional diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 2cfcada024b1540f32dd6d8a7f47d9228c276f2c..c1893db8016b60fba725e12f0462406a583d864f 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import random import time diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..5848dc014ca6997ee26701f4460a5b5125096781 --- /dev/null +++ b/tests/kernels/test_rocm_attention_selector.py @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import patch + +import pytest +import torch + +from tests.kernels.utils import override_backend_env_variable +from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms.rocm import RocmPlatform + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +def test_selector(monkeypatch): + """Test that the attention selector for ROCm. 
+ """ + override_backend_env_variable(monkeypatch, "ROCM_FLASH") + + with patch("vllm.attention.selector.current_platform", RocmPlatform()): + backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) + assert backend.get_name() == "ROCM_FLASH" + # mla test for deepseek related + backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, + False, True) + assert backend.get_name() == "TRITON_MLA" diff --git a/tests/kernels/test_rotary_embedding.py b/tests/kernels/test_rotary_embedding.py index da879406b3936833b041daecc48ed035c70c552a..362bcb35ceabf399cbbc9f907926ec5b1537b929 100644 --- a/tests/kernels/test_rotary_embedding.py +++ b/tests/kernels/test_rotary_embedding.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Tests for miscellaneous utilities """ diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/test_triton_decode_attention.py index 14f5a3b770b69677496e915811134ec228a6f004..fd3c9fa4196a7260a97cfb4bcb62a073d529dc71 100644 --- a/tests/kernels/test_triton_decode_attention.py +++ b/tests/kernels/test_triton_decode_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py index a5aab3c2ea4b021ac0864be13970ff21b8d496da..d878ed6f45144afaec0262e665bb06ff6e663367 100644 --- a/tests/kernels/test_triton_scaled_mm.py +++ b/tests/kernels/test_triton_scaled_mm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the triton_scaled_mm kernel Run `pytest tests/kernels/test_triton_scaled_mm.py`. 
diff --git a/tests/kernels/test_utils.py b/tests/kernels/test_utils.py index a5a84ac3080d99026dc4d64a185f22dbcc1713e9..58b0c78a580b2e19f2389a84f04217f3b56bf90b 100644 --- a/tests/kernels/test_utils.py +++ b/tests/kernels/test_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Tests for miscellaneous utilities """ diff --git a/tests/kernels/untest_aqlm.py b/tests/kernels/untest_aqlm.py index 860fb66b17354719d8b4b3f3591e7d81a239e967..7d36172815b78672f24afe651e0717051d256a74 100644 --- a/tests/kernels/untest_aqlm.py +++ b/tests/kernels/untest_aqlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from tests.kernels.utils import opcheck diff --git a/tests/kernels/untest_awq.py b/tests/kernels/untest_awq.py index aa7a430850f9af1b742a5f1e4e3f195d562c8792..ace75a3361734f6f116efcd6daac2299a79db0b5 100644 --- a/tests/kernels/untest_awq.py +++ b/tests/kernels/untest_awq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/kernels/untest_causal_conv1d.py b/tests/kernels/untest_causal_conv1d.py index 51be2425d7dd78c3005ceea83ec8eb51176934ed..93064e23dd7d1dabfb1cb2100329198aa2219cff 100644 --- a/tests/kernels/untest_causal_conv1d.py +++ b/tests/kernels/untest_causal_conv1d.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import pytest diff --git a/tests/kernels/untest_flashinfer.py b/tests/kernels/untest_flashinfer.py index 1645ef911d6979543478f611d24fdb22be8dea49..212ceb5e4174683bbee7742ee624eaebb9edf76d 100644 --- a/tests/kernels/untest_flashinfer.py +++ b/tests/kernels/untest_flashinfer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import flashinfer diff --git a/tests/kernels/untest_fp8_quant.py b/tests/kernels/untest_fp8_quant.py index ebaaae2321885d279d3b54fe50e5faa743060f4f..876cf03fd644c9076aad2f49dc35fe9f0caca954 100644 --- a/tests/kernels/untest_fp8_quant.py +++ 
b/tests/kernels/untest_fp8_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/untest_ggml.py b/tests/kernels/untest_ggml.py index dddb285bf26ec5dcafc2b8f4247b296d428614f1..dc728fd4861df9a701545484092e0296a56faaec 100644 --- a/tests/kernels/untest_ggml.py +++ b/tests/kernels/untest_ggml.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import gguf import pytest import torch diff --git a/tests/kernels/untest_gptq.py b/tests/kernels/untest_gptq.py index c1ca6f1f5191b8271de9499aedbd77f6d2aa5107..fea013d9e5795aef49574c2d238a5a09165d6eaf 100644 --- a/tests/kernels/untest_gptq.py +++ b/tests/kernels/untest_gptq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from tests.kernels.utils import opcheck diff --git a/tests/kernels/untest_mamba_ssm.py b/tests/kernels/untest_mamba_ssm.py index 19d1158c79c7318d01ed6108078b48b9165e75e1..84d4c347e0d81683b6ee882ea2924635015b6935 100644 --- a/tests/kernels/untest_mamba_ssm.py +++ b/tests/kernels/untest_mamba_ssm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch import torch.nn.functional as F diff --git a/tests/kernels/untest_marlin_gemm.py b/tests/kernels/untest_marlin_gemm.py index 5e047f4b099f1825cd9232510650982b2702cf29..b96aca06cdff3c051379978f7b576bc02a7c3fe4 100644 --- a/tests/kernels/untest_marlin_gemm.py +++ b/tests/kernels/untest_marlin_gemm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the marlin kernel. Run `pytest tests/kernels/marlin/test_marlin_gemm.py`. 
diff --git a/tests/kernels/untest_permute_cols.py b/tests/kernels/untest_permute_cols.py index 14ad7a22cf7cf6bf37e308d540d4a55b40154e1b..35d62079fb65ded26b8f405a8f7056ec8565417b 100644 --- a/tests/kernels/untest_permute_cols.py +++ b/tests/kernels/untest_permute_cols.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index c735c5edd7a362b9d4f90a0332f875aab8d66b07..5be111d7130829e43bf7d59722f5d5482b078311 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Kernel test utils""" import itertools diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py index 4a2ae8047641eb034d41e5d0f7519729e8095188..98b273625e4374afabde1553f2d0e513e7e0a8c9 100644 --- a/tests/kv_transfer/disagg_test.py +++ b/tests/kv_transfer/disagg_test.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import subprocess import sys diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/module_test.py index 355461919cd7c31012570f7479b5bc04f2d1d8b6..8a6490b5c8876f1d573e40004498a79f6b3e30e1 100644 --- a/tests/kv_transfer/module_test.py +++ b/tests/kv_transfer/module_test.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import subprocess import sys diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 4d6890305af737559b9d31a8720cda9bde37bfc7..c5b34660d1658d8ffab150742fcd08cf48d2daf2 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import random diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 1cc1ced9968d74d41b86f2a5640461f6a2ae7463..181a5ac207fe5f098cd6a6b6677196323b7474fa 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -1,3 
+1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time from typing import List diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 989c6623bcc4ab9546f4ed67e3f00fb9b314d408..afce599a84afb8a5eefe2e54545f0fb86af84c0a 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import tempfile from collections import OrderedDict from typing import Dict, List, TypedDict diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py index 61b8899f0533c047963ee55507b05d65bf8264fe..2d33f738bd87479c314073d56bdbf6dca5cd7b9d 100644 --- a/tests/lora/data/long_context_test_data.py +++ b/tests/lora/data/long_context_test_data.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa """This file contains a dictionary of prompts and golden responses.""" diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 0bbf8450d456bef4a249c1e3478f54b4e7fff303..d9d30cf70144f573b232552850ed82cdf9c821b3 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 23f775726292c6bd43eb204235cba08bc0a20ffe..9d9aea70245bce56f24242a7435f16d7f1797370 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import os diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index cdd7eeff630e9943b01527aaf2b56358b9960ffe..39b7af96f4600a00e006ac9491f13edae788847a 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py index 
6aa33926cb6b8bd835ddc95ec4aee82f20962b86..c04174665897cd0084e051dd7cb21691286701a0 100644 --- a/tests/lora/test_jamba.py +++ b/tests/lora/test_jamba.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 08a589d7ee29c36a3ee66ac7982e9cacccec9683..0838ca02c9b7a6a18708497768b422fb2692cd1a 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from copy import deepcopy from dataclasses import dataclass diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 6e762ad442da32c7bc537655e7cb6e2af8aa74f0..5517b9bdb1bbd2a4ee2bc9e0a1de2be18fdd5ea3 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import ray diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 09285eb1fe22be266a59dd01af1bd135e81540d4..bd538c9911701caf36afd88d6e4683f24b156c82 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ast from typing import List, Optional, Tuple diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py index 141c1801a11bdc1e36bd91e720798d029e284cde..0bc18418bfc86481fdac0a6b4e8b52d29382aad5 100644 --- a/tests/lora/test_lora_bias_e2e.py +++ b/tests/lora/test_lora_bias_e2e.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import os diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index b907af47d08d7f13488370f3426ed7d3ac2a7813..d2a4b901bd8d7503b56bd6404c488eb49c028bca 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest 
diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 1c0ee01c038d0dbdd5022ebdd2d711fd2cbca0e7..273fe9ae0eb55c0fe91324db49bac99b01dac977 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 9a5b9aabf5078fa54ea4919908302d3f4ebd0c4c..6666f54fdebd1cacd38a6cf663c5a863b5698c15 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Dict, List diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 0fa6ff45dd63b407691bd99e427efc04c987986e..4f47d7dcdb1e7406e7a8f5da429e28550ee029d5 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import os diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 8307a6b6d15130f5c6031afde36fb36d529360a4..5660f3f5b82fa267aa3888ded1b0f9e4706619bc 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index a524d5ce5f34ae51871eeadb19c9d1d33c31f1ef..9935472ad18f457e1398520af12c0b12d7915f64 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import math import shutil diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 5ab0eca1f1a89d347d5ecc5eadea6273106b0174..6279c7047beaf5ce263d34563e41f9fe98ee064b 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing 
import List import os diff --git a/tests/lora/test_punica_ops_sizes.py b/tests/lora/test_punica_ops_sizes.py index 433ca7577d0848cd87b798b90bd4e2c128412d88..ecd3bc4978f3911e601b83322913ce070e87a3e9 100644 --- a/tests/lora/test_punica_ops_sizes.py +++ b/tests/lora/test_punica_ops_sizes.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This script is mainly used to tests various hidden_sizes. We have collected the hidden_sizes included in the LoRA models currently supported by vLLM. It tests diff --git a/tests/lora/test_punica_ops_variation.py b/tests/lora/test_punica_ops_variation.py index 48b4686cfbb8616738c2d898c7cacf6f324f89d1..53b5cb744db90eaf7d4fc5d0849053d199dfac41 100644 --- a/tests/lora/test_punica_ops_variation.py +++ b/tests/lora/test_punica_ops_variation.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This script is mainly used to test whether trtion kernels can run normally under different conditions, including various batches, numbers of LoRA , and diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 1c1ccfd2e54a2968ab75b516c5119e09e4481a79..00782445937199fa11a8091b88f8f074c504aeef 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py from dataclasses import dataclass diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 570aa3861d0be6113a23a77cd48fd69f7b63f491..a988f06ab25f059c36b2ff5b6db36e97c1da12fa 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 0b0d913a5136645a759e1ee36be62f62fcf22c82..d4905b02cac06fffdb8eb1a6bcf7345b11eb4719 100644 --- 
a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py new file mode 100644 index 0000000000000000000000000000000000000000..703f92ce8b6bca6ef6cb667ba03c60268fe08c11 --- /dev/null +++ b/tests/lora/test_ultravox.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 + +import shutil +from os import path +from tempfile import TemporaryDirectory +from typing import List, Tuple + +import torch +from huggingface_hub import snapshot_download +from safetensors.torch import load_file, save_file +from transformers import AutoTokenizer + +from vllm.lora.request import LoRARequest + +from ..models.utils import check_outputs_equal + +ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3" +LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" + +VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" + +PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!" 
+ + +def llama3_1_8b_chess_lora_path(): + return snapshot_download( + repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b") + + +# can't use llama lora adapter without module name transformation +# because ultravox nest language model +def transform_module_names_for_ultravox(state_dict): + transformed_state_dict = {} + for key, value in state_dict.items(): + new_key = key.replace("base_model.model", + "base_model.model.language_model") + transformed_state_dict[new_key] = value + return transformed_state_dict + + +def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path): + tensor_file = "adapter_model.safetensors" + state_dict = load_file(path.join(source_repo, tensor_file)) + transformed_state_dict = transform_module_names_for_ultravox(state_dict) + + save_file(transformed_state_dict, path.join(target_path, tensor_file)) + + config_file = "adapter_config.json" + shutil.copyfile(path.join(source_repo, config_file), + path.join(target_path, config_file)) + return target_path + + +def _get_prompt(audio_count, question, placeholder, model_name) -> str: + tokenizer = AutoTokenizer.from_pretrained(model_name) + placeholder = f"{placeholder}\n" * audio_count + + return tokenizer.apply_chat_template([{ + 'role': 'user', + 'content': f"{placeholder}{question}" + }], + tokenize=False, + add_generation_prompt=True) + + +def test_ultravox_lora(vllm_runner): + """ + TODO: Train an Ultravox LoRA instead of using a Llama LoRA. + """ + # Workaround to prevent device mismatch in Whisper. 
+ # Can be removed when it is fixed upstream in transformer + # https://github.com/huggingface/transformers/pull/35866 + torch.set_default_device("cpu") + + llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path() + with TemporaryDirectory() as temp_ultravox_lora_dir: + llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora( + llama3_1_8b_chess_lora, temp_ultravox_lora_dir) + with vllm_runner( + ULTRAVOX_MODEL_NAME, + enforce_eager=True, + max_num_seqs=2, + enable_lora=True, + max_loras=1, + max_lora_rank=128, + dtype="bfloat16", + max_model_len=1024, + ) as vllm_model: + ultravox_outputs: List[Tuple[ + List[int], str]] = vllm_model.generate_greedy( + [ + _get_prompt(0, PROMPT, VLLM_PLACEHOLDER, + ULTRAVOX_MODEL_NAME) + ], + 256, + lora_request=LoRARequest(str(1), 1, + llama3_1_8b_ultravox_chess_lora), + ) + + # run llama with and without lora to compare outputs with above + with vllm_runner( + LLMA_MODEL_NAME, + enforce_eager=True, + max_num_seqs=2, + enable_lora=True, + max_loras=1, + max_lora_rank=128, + dtype="bfloat16", + max_model_len=1024, + ) as vllm_model: + llama_outputs: List[Tuple[List[int], str]] = ( + vllm_model.generate_greedy( + [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)], + 256, + lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora), + )) + + check_outputs_equal( + outputs_0_lst=ultravox_outputs, + outputs_1_lst=llama_outputs, + name_0="ultravox", + name_1="llama", + ) diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 85110b8fa8cd2ecede7e67ae02be4a1c8e3193bf..34a26e9edf36ac03129da4bbd946d8c319cead68 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import OrderedDict from unittest.mock import patch diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 375d0dc1492017a877538c9b677cad10d92d82f4..6d706cedb137b32fff9eedd176d73e1dc085de5f 100644 --- a/tests/lora/test_worker.py +++ 
b/tests/lora/test_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import random import tempfile diff --git a/tests/lora/utils.py b/tests/lora/utils.py index ce47546f2154b7e14161c9122886324511684c51..bda00e08190ef970ccd07ce113e95744eee1dde6 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Optional import torch diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e042ee324899808d28d0e0108ed9d5f8dd7f6a39..2d2de59744adcfa321a64dd2f6981a605c54c626 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time from typing import List diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py index 10792b0a049993478eeb53d05b2a56bd45337394..b588a1a96638bdc265e434696471f9d6a13e5bb0 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 0a3aba255fd765ce12671d84e85bb5c70cd0cf61..2c678084856708084a3671173598fa192aa0ba2f 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index a8fe3117c220cb1a2069797828c640fdccd8c0a7..7a600c2028347d1d48136a19fd14d49c5a3fdd47 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle import pytest diff --git a/tests/model_executor/test_model_load_with_params.py 
b/tests/model_executor/test_model_load_with_params.py index ea7c9182653fa193f05bed2c23b2abdb8d33e54d..79b35e81231529ae62c1c8ff69ed151f84535f48 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index 0ef80c0a5700badf83fe5cb5aaaa0d643e76fd6b..ca59fc14fc0a6cfade33243eb963126d3b7959e5 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import tempfile diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index 977d8689941eab478ae4cf09d2c34c99e60f506c..ed65e6ae87f50fa2fa64b2e609b517a1b94407ff 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Type import numpy as np diff --git a/tests/models/decoder_only/language/test_aqlm.py b/tests/models/decoder_only/language/test_aqlm.py index 0fd17f70a72d4e3684d61efe54b6d8d9af0757fc..b3a581e317f7b8c58c2253c7af2afaae4e4ffa1a 100644 --- a/tests/models/decoder_only/language/test_aqlm.py +++ b/tests/models/decoder_only/language/test_aqlm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of a AQLM model between vLLM and HF Transformers Run `pytest tests/models/test_aqlm.py`. 
diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index a60545ae844899f9f4f31c3659dfef2f023b2391..c07d33e991710018b4f3c6dcd08f03d2199f9915 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # flake8: noqa """Tests fp8 models against ground truth generation Note: these tests will only pass on L4 GPU. diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 4e9268d8ea00f1e4348c0df14f85618880a4bc51..3a37f65c503f9433c9947f9a22eb54a801fd292a 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Tests gguf models against unquantized models generations Note: To pass the test, quantization higher than Q4 should be used diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py index 6147ccd0e6a9a49e1da02c2e06ff0acd386b645a..3a66d603a71f3582ecc534dcd466e02df99cc768 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin.py +++ b/tests/models/decoder_only/language/test_gptq_marlin.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compares the outputs of gptq vs gptq_marlin Note: GPTQ and Marlin do not have bitwise correctness. 
As a result, in this test, we just confirm that the top selected tokens of the diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py index 2278587c55d1953db145fdab2354ddebe7fb5c8e..11ed17975d6e539f0c253805a849471ead3d256f 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin_24.py +++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of a GPTQ model to a Marlin_24 model. Note: GPTQ and Marlin_24 do not have bitwise correctness. diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index e7a4dd1bb5cc91ab600083fee018cfed388e9d19..62ced24a011ef7c6b9359bf1634ed3d24b292cb0 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for Granite models using greedy sampling. Run `pytest tests/models/test_granite.py`. diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 242fc7a22b900ebf18f5dd0d2c91fa5691e9d923..585b5087330b4e372637a2c4abebf43b30bb869b 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 9fb87babaa44a4709b704f38202fd9ba190c7a87..e392371c69e5b1cfb39d42b9b8451e9b64573319 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM when using greedy sampling for Mamba. 
Run `pytest tests/models/test_mamba.py`. diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index b6c208b156aca779c9961a2f5263037b05753078..e45dd95d389f679f58577d0c1fd7d124f95d8a91 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. Run `pytest tests/models/test_mistral.py`. diff --git a/tests/models/decoder_only/language/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py index fd1d34d8ee20fd598c20d7b0bf20e1e9387cb79b..e1a8c7ec5baf0ab36616981e7badab0e4dbce185 100644 --- a/tests/models/decoder_only/language/test_modelopt.py +++ b/tests/models/decoder_only/language/test_modelopt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # flake8: noqa """Tests Model Optimizer fp8 models against ground truth generation Note: these tests will only pass on H100 diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 33d47d74a96e4d7fb7fc27df40bf366e15b04345..2adde4b8494cbf35a1d2eec65767abf86b6d3f9a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM when using greedy sampling. Run `pytest tests/models/test_models.py`. diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index 85d40182078152d16984283eeb155399671ca9bb..0bfb02e5bb4fa488583148e54d064d24901f6eff 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for moe models using greedy sampling. 
Run `pytest tests/models/test_phimoe.py`. diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py index a81b6617c4cd59cbaa5467f812d96b91ac44925c..29c3273a1464cb81722150488c289b8b4dac532b 100644 --- a/tests/models/decoder_only/vision_language/test_awq.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Type import os diff --git a/tests/models/decoder_only/vision_language/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py index 53e4e4553f3506a32894192f6e0a1ac8bedfa95c..61194fdd4da791d46638b006244a47da44cc8253 100644 --- a/tests/models/decoder_only/vision_language/test_intern_vit.py +++ b/tests/models/decoder_only/vision_language/test_intern_vit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import os diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 1e68acb4fc2b9d7d73b755155f439ef3a52a0474..0653eaf78824ebae13630b7a59cc71e934261ffb 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. """ @@ -9,6 +10,7 @@ from typing import Type import os import pytest +from packaging.version import Version from transformers import AutoModelForVision2Seq from transformers import __version__ as TRANSFORMERS_VERSION @@ -121,6 +123,8 @@ VLM_TEST_SETTINGS = { else ("half", "float")), marks=[pytest.mark.core_model], ), + # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL + # once we upgraded to transformers>=4.49.0. 
"qwen2_vl": VLMTestInfo( models=[os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")], test_type=( @@ -138,6 +142,26 @@ VLM_TEST_SETTINGS = { image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), + "qwen2_5_vl": VLMTestInfo( + models=["Qwen/Qwen2.5-VL-3B-Instruct"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.skipif( + TRANSFORMERS_VERSION < "4.49.0", + reason="HF model requires transformers>=4.49.0", + ), pytest.mark.core_model, pytest.mark.cpu_model], + ), #### Extended model tests "aria": VLMTestInfo( models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")], @@ -155,13 +179,7 @@ VLM_TEST_SETTINGS = { stop_str=["<|im_end|>"], image_size_factors=[(0.10, 0.15)], max_tokens=64, - marks=[ - pytest.mark.skipif( - TRANSFORMERS_VERSION < "4.48.0", - reason="HF model requires transformers>=4.48.0", - ), - large_gpu_mark(min_gb=64), - ], + marks=[large_gpu_mark(min_gb=64)], ), "blip2": VLMTestInfo( models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")], @@ -207,8 +225,8 @@ VLM_TEST_SETTINGS = { image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], marks=[ pytest.mark.skipif( - TRANSFORMERS_VERSION >= "4.48.0", - reason="HF model is not compatible with transformers>=4.48.0", + Version(TRANSFORMERS_VERSION) >= Version("4.48"), + reason="HF model is not compatible with 
transformers>=4.48", ) ], ), @@ -251,17 +269,18 @@ VLM_TEST_SETTINGS = { max_model_len=8192, dtype="bfloat16", use_tokenizer_eos=True, + num_logprobs=10, patch_hf_runner=model_utils.h2ovl_patch_hf_runner, ), "idefics3": VLMTestInfo( - models=[os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")], + models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>", max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForVision2Seq, - marks=[large_gpu_mark(min_gb=48)], + hf_output_post_proc=model_utils.idefics3_trunc_hf_output, ), "intern_vl": VLMTestInfo( models=[ @@ -283,7 +302,6 @@ VLM_TEST_SETTINGS = { dtype="bfloat16", use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, - marks=[large_gpu_mark(min_gb=32)], ), "llava_next": VLMTestInfo( models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")], @@ -340,6 +358,12 @@ VLM_TEST_SETTINGS = { auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, patch_hf_runner=model_utils.mantis_patch_hf_runner, + marks=[ + pytest.mark.skipif( + Version(TRANSFORMERS_VERSION) >= Version("4.48"), + reason="HF model is not compatible with transformers>=4.48", + ) + ], ), "minicpmv_25": VLMTestInfo( models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")], diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index f6f829ca5d5f856ce5d0329b3f568cb7bc0a763a..7323494a8851eec239ae065e4778051841f0d135 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import re from typing import List, Optional, Tuple, Type 
diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index fd58a139eef96c8b7e9f6225b2807bb88dc62d05..c614e61c1a3385b1e3a68637557a8d9d3fb2fc0e 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for Mistral models using greedy sampling. Run `pytest tests/models/test_mistral.py`. diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index f62cb071d6bb4780125186ac452613ecb7e58c6d..b054359bf33ee582819c87cb9034e97d9880a26d 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, List, Optional, Tuple, Type, TypedDict, Union import os diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py index 59773be709fa8dcdc2b6f68d3e519941988af539..539410d18950acd05bf7a029e179baf8dbbf2432 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Helpers for building inputs that can be leveraged for different test types. 
""" from pathlib import PosixPath diff --git a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py index 9bb71341606593ecef231cca853cfd5ce4b8b593..ca4ec21411825086d47123d03290ab089dfa938a 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utils for determining which subset of model tests belong to a specific modality, getting all combinations (similar to pytest's parametrization), handling multimodal placeholder substitution, and so on. diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index 54b7b0733210f9c9d5c24af0edc4e6c88695e11b..0aed267692ab1c57046cf01710153144bd3924a6 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Core test implementation to be shared across modalities.""" from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union @@ -153,4 +154,4 @@ def process_runner_outputs( def process_outputs(output_processor, model, outputs_per_image): """Applies a model specific post-processor function to a runner's output""" return [[output_processor(res, model) for res in outputs] - for outputs in outputs_per_image] \ No newline at end of file + for outputs in outputs_per_image] diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py index 2291f4fa0d0acee9fb3a86ab26b585b2abb4cc03..2f03a114ae531d07fc4779d9e3a838ab8bd30768 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py 
@@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Custom input builders for edge-cases in different models.""" from typing import Callable diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 07bdb2cee44d2c80efa41148a6118230fb90d7af..ced891e1e2c2069fe412912d5575d7e60b232c62 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Common utility functions relating to different models that are useful for manipulating the input / output of HF & vLLM test runners, which are typically specific to a small subset of models. @@ -191,6 +192,14 @@ def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, return output_ids, output_str, out_logprobs +def idefics3_trunc_hf_output(hf_output: RunnerOutput, + model: str) -> RunnerOutput: + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith(""): + output_str = output_str.split("")[0] + return output_ids, output_str, out_logprobs + + def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output @@ -333,12 +342,12 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def __init__(self, hf_runner: HfRunner): self.num_image_token = hf_runner.model.num_image_token self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype self.config = AutoConfig.from_pretrained(hf_runner.model_name, trust_remote_code=True) self.vision_config = self.config.vision_config self.use_thumbnail = self.config.use_thumbnail + self.use_msac = self.config.use_msac self.min_num = self.config.min_dynamic_patch self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size @@ -347,18 +356,19 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: **kwargs): # 
yapf: disable from vllm.model_executor.models.h2ovl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl) # yapf: enable images = [images] if isinstance(images, Image) else images pixel_values = [ - image_to_pixel_values(image, - self.image_size, - self.min_num, - self.max_num, - self.use_thumbnail, - use_MSAC=self.config.use_msac).to( - self.dtype) for image in images + image_to_pixel_values_h2ovl( + image, + input_size=self.image_size, + min_num=self.min_num, + max_num=self.max_num, + use_thumbnail=self.use_thumbnail, + use_msac=self.use_msac, + ) for image in images ] num_patches_list = [ pixel_value.shape[0] for pixel_value in pixel_values @@ -393,7 +403,6 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def __init__(self, hf_runner: HfRunner): self.num_image_token = hf_runner.model.num_image_token self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype self.config = AutoConfig.from_pretrained(hf_runner.model_name, trust_remote_code=True) @@ -406,13 +415,17 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def __call__(self, text: str, images: Union[Image, List[Image]], **kwargs): from vllm.model_executor.models.internvl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + IMG_CONTEXT, IMG_END, IMG_START, + image_to_pixel_values_internvl) images = [images] if isinstance(images, Image) else images pixel_values = [ - image_to_pixel_values(image, self.image_size, self.min_num, - self.max_num, - self.use_thumbnail).to(self.dtype) - for image in images + image_to_pixel_values_internvl( + image, + input_size=self.image_size, + min_num=self.min_num, + max_num=self.max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images ] num_patches_list = [ pixel_value.shape[0] for pixel_value in pixel_values @@ -447,7 +460,8 @@ def _internvl_generate( ) -> torch.LongTensor: """Generate method for InternVL2 model without fixed 
use_cache.""" assert self.img_context_token_id is not None - vit_embeds = self.extract_feature(pixel_values) + target_dtype = next(self.parameters()).dtype + vit_embeds = self.extract_feature(pixel_values.to(target_dtype)) input_embeds = self.language_model.get_input_embeddings()(input_ids) B, N, C = input_embeds.shape input_embeds = input_embeds.reshape(B * N, C) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py index 2d3b39fe3594ef2f7bee1010ece7976cdf27af80..fb9df37cad92a5067d0336ce2a4adcd4350b1ff9 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Entrypoints for wrapping the core run_test implementation for specific test types / modalities. """ diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index e2e0c6390fcb9e02503f07c6d1b51b855fdb7f73..ae3b9d59bf9b6a49f8c1c6764068bf1c4745d7a1 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Types for writing multimodal model tests.""" from enum import Enum from pathlib import PosixPath diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 3fc32266258219ab1e06aaf8c57bd4da3541ba58..08564d730d35cbd5536eda72a3c18b1de6e6940f 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the classification outputs of HF and vLLM models. Run `pytest tests/models/test_cls_models.py`. 
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 21d3484fda4bb8213da2f3f0a7a21d05e65ebcb3..f31e203a8cb90048046035594a91ce7447671a81 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the embedding outputs of HF and vLLM models. Run `pytest tests/models/embedding/language/test_embedding.py`. diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py index ecc494f165db991e55e8acad0e8943877519d30c..c9736206c167c8f5e2f3edda70ca6eae38291646 100644 --- a/tests/models/embedding/language/test_gritlm.py +++ b/tests/models/embedding/language/test_gritlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import importlib.util import math from array import array diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index d2973432c2c8b83f6a34887628fbafff95264c23..46be976ee8f2ca60b83b69a0213f5f63997fed46 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the scoring outputs of HF and vLLM models. Run `pytest tests/models/embedding/language/test_scoring.py`. 
diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index f96c7d2b176dbe82b84a366e053e02d359b1e2f2..567aa50984937bbd8f4da12b0ee4c1baec3bc57c 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Sequence import torch diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py index 137393add34f47bc9d323cafb84375ec10ab24e9..6b6c9d5aaf572c6e29014e041938ba18358c4a9c 100644 --- a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial from typing import Callable, Dict, List, Type diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 321d10c14660102ba1979fc1b30a1b78339d0b5e..034f7c214aac2cb0a0a5def5298e5e0e6d6df9b1 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -1,9 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Type import os import pytest import torch.nn.functional as F -import transformers from transformers import AutoModelForVision2Seq from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner @@ -56,6 +57,10 @@ def _run_test( with hf_runner(model, dtype=dtype, auto_cls=AutoModelForVision2Seq) as hf_model: + # Patch the issue where generation_config.json is missing + hf_model.processor.patch_size = \ + hf_model.model.config.vision_config.patch_size + # Patch the issue where image_token_id # exceeds the maximum allowed vocab size hf_model.model.resize_token_embeddings( @@ -87,8 +92,6 @@ def _run_test( ) -@pytest.mark.skipif(transformers.__version__ >= "4.46", - reason="Model broken with 
changes in transformers 4.46") @pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index d2ad60338776f86590f4140d7c16e8d3c19c064c..4fd07364147ad04c13bea0d75c625e095d4d9d81 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Type import os diff --git a/tests/models/encoder_decoder/audio_language/test_whisper.py b/tests/models/encoder_decoder/audio_language/test_whisper.py index eb238c5332139c48f265426a036c81ddf01ef6eb..80d6897da7e024195f57615575c893fe1ed1b00a 100644 --- a/tests/models/encoder_decoder/audio_language/test_whisper.py +++ b/tests/models/encoder_decoder/audio_language/test_whisper.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for Whisper models using greedy sampling. Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`. diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 3280222109e7e7c6dec5bcfa91cf7c45a40772e9..92d19f997089fd9e9cbee669b742aec126144293 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM for BART models using greedy sampling. Run `pytest tests/models/encoder_decoder/language/test_bart.py`. 
diff --git a/tests/models/encoder_decoder/vision_language/test_broadcast.py b/tests/models/encoder_decoder/vision_language/test_broadcast.py index 39311751db1d5ee720a4046d06a022aedf126274..14bf0db07239651990d952df7852f0925677578d 100644 --- a/tests/models/encoder_decoder/vision_language/test_broadcast.py +++ b/tests/models/encoder_decoder/vision_language/test_broadcast.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py index 028086bdcd006d4f43c7d597f5d3af10856de805..f37903ee0f53435f847bd21c9f2f33ac0f4a174c 100644 --- a/tests/models/encoder_decoder/vision_language/test_florence2.py +++ b/tests/models/encoder_decoder/vision_language/test_florence2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial from typing import List, Optional, Tuple, Type diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 58e5a785af9ed9465b3f8fd00242003d683d4434..90b8b117bb64739ad4e9d8369ae882351626e40a 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Type, overload import os diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index ca28da268fa05bc796a519c25badfe855065d373..77cf3442df905075b9f927bf7c321bb5970466bb 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial import numpy as np @@ -139,13 +141,15 @@ def _test_processing_correctness( # yapf: disable -# True if the model supports multiple data 
items of the modality per request @pytest.mark.parametrize("model_id", [ "rhymes-ai/Aria", "Salesforce/blip2-opt-2.7b", "facebook/chameleon-7b", "deepseek-ai/deepseek-vl2-tiny", "adept/fuyu-8b", + "h2oai/h2ovl-mississippi-800m", + "OpenGVLab/InternVL2-1B", + "HuggingFaceM4/Idefics3-8B-Llama3", "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", @@ -154,8 +158,10 @@ def _test_processing_correctness( "mistral-community/pixtral-12b", "openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-V-2_6", + "nvidia/NVLM-D-72B", "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "fixie-ai/ultravox-v0_3", ]) diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py new file mode 100644 index 0000000000000000000000000000000000000000..767ac5eb9ef9a1b8376353ef1f34db0b680afaad --- /dev/null +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for H2OVL's multimodal preprocessing kwargs.""" +from typing import Optional + +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.utils import cached_get_tokenizer + +from ....conftest import _ImageAssets +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", [ + "h2oai/h2ovl-mississippi-800m", + "h2oai/h2ovl-mississippi-2b", +]) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8]) +@pytest.mark.parametrize("dynamic_image_size", [True, False]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override( + model_id: str, + image_assets: _ImageAssets, + size_factors: list[int], + max_dynamic_patch: int, + dynamic_image_size: 
Optional[bool], + num_imgs: int, +): + from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets, + get_h2ovl_target_ratios) + + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + trust_remote_code=True, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) + + config = processor.info.get_hf_config() + use_msac = config.use_msac + + mm_processor_kwargs = { + "max_dynamic_patch": max_dynamic_patch, + } + if dynamic_image_size is not None: + mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size + + min_num = config.min_dynamic_patch + max_num = max_dynamic_patch if dynamic_image_size else 1 + + # Build the image str / prompt based on the number of images we pass + prompt = "" * num_imgs + + for asset in image_assets: + for factor in size_factors: + image = rescale_image_size(asset.pil_image, factor) + mm_data = {"image": [image] * num_imgs} + + width, height = image.size + + # Calculate the expected number of blocks + if num_imgs == 1 and use_msac: + # First pass + blocks1, _, _, aspect_ratio = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=None, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, # Thumbnail is handled separately + ) + + # Second pass + blocks2, _, _, _ = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=aspect_ratio, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, + ) + + # Add thumbnail if use_thumbnail is True and total_blocks > 1 + if config.use_thumbnail: + blocks1 += 1 if blocks1 > 1 else 0 + blocks2 += 1 
if blocks2 > 1 else 0 + + # Total blocks is the sum of blocks from both passes minus + # overlapping + total_blocks = blocks1 + blocks2 - 1 + + expected_num_patches = total_blocks + else: + blocks, _, _, _ = calculate_h2ovl_targets( + orig_width=width, + orig_height=height, + target_ratios=get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=None, + ), + image_size=config.vision_config.image_size, + use_thumbnail=False, + ) + expected_num_patches = blocks + + if config.use_thumbnail and expected_num_patches != 1: + expected_num_patches += 1 + + processed_inputs = processor.apply(prompt, mm_data, + mm_processor_kwargs) + pixel_shape = ( + processed_inputs["mm_kwargs"]["pixel_values_flat"].shape) + + assert pixel_shape[0] == expected_num_patches * num_imgs diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index 90fa9b01dac385f6b86b2725292af4c1cc71b5ca..ee3d45dccf1fe45b933b42df253cacee1b1848e2 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -1,13 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for Idefics3's multimodal preprocessing kwargs.""" -from typing import Optional - import os import pytest -import torch -from transformers import AutoImageProcessor, AutoTokenizer +from transformers import Idefics3Config -from vllm.inputs import InputContext, token_inputs -from vllm.multimodal import MultiModalRegistry +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer from ....conftest import _ImageAssets from ...utils import build_model_context @@ -16,163 +14,53 @@ from ....utils import models_path_prefix models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")] -# Wrap lazy imports to avoid initializing CUDA during test collection -@pytest.fixture() -def input_processor_for_idefics3(): - from vllm.model_executor.models.idefics3 
import ( - input_processor_for_idefics3) - return input_processor_for_idefics3 - - -@pytest.fixture() -def dummy_data_for_idefics3(): - from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3 - return dummy_data_for_idefics3 - - -@pytest.fixture() -def get_max_idefics3_image_tokens(): - from vllm.model_executor.models.idefics3 import ( - get_max_idefics3_image_tokens) - return get_max_idefics3_image_tokens - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336]) -def test_input_mapper_override(model: str, image_assets: _ImageAssets, - longest_edge: Optional[int]): - """Ensure that the [default] input mapper handles size properly.""" - - mm_processor_kwargs = { - "size": { - "longest_edge": longest_edge - } - } if longest_edge is not None else {} - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=mm_processor_kwargs, - ) - - hf_processor = AutoImageProcessor.from_pretrained(model, - trust_remote_code=True, - **mm_processor_kwargs) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - image = image_assets[0].pil_image - hf_result = hf_processor.preprocess( - image, - return_tensors="pt", - ) - - vllm_result = mm_registry.map_input( - ctx.model_config, - {"image": image}, - ) - - assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("longest_edge, expected_max_tokens", [ - (None, 2873), - (168, 169), - (336, 169), - (400, 338), - (672, 338), -]) -def test_max_tokens_override(get_max_idefics3_image_tokens, model: str, - longest_edge: Optional[int], - expected_max_tokens: int): - """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs.""" - size = {"longest_edge": longest_edge} if longest_edge is not None else None - ctx = build_model_context( - model_name=model, - 
tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - actual_max_tokens = get_max_idefics3_image_tokens( - ctx=InputContext(ctx.model_config), - size=size, - ) - - assert expected_max_tokens == actual_max_tokens - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [ - (168, 169, 1), - (168, 169, 2), - (400, 338, 1), - (400, 338, 2), -]) -def test_dummy_data_override(dummy_data_for_idefics3, model: str, - longest_edge: int, toks_per_img: int, - num_imgs: int): - """Ensure dummy_data_for_idefics3 handles num_crops properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the dummy data func. - size = {"longest_edge": longest_edge} if longest_edge is not None else None - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - dummy_data = dummy_data_for_idefics3( - ctx=ctx, - seq_len=8192, # Should be bigger than num_imgs * toks_per_img - mm_counts={"image": num_imgs}, - size=size) - sequence_data = dummy_data.seq_data - # Ensure we have the right number of placeholders per size - image_token_id = ctx.get_hf_config().image_token_id - img_tok_count = sequence_data.get_token_ids().count(image_token_id) - assert img_tok_count == toks_per_img * num_imgs - - @pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [ - (336, 169 * (1**2 + 1), 1), - (336, 169 * (1**2 + 1), 2), - (400, 169 * (2**2 + 1), 1), - (400, 169 * (2**2 + 1), 2), -]) -def test_input_processor_override(input_processor_for_idefics3, - image_assets: _ImageAssets, model: str, - longest_edge: int, - expected_toks_per_img: int, num_imgs: int): +# yapf: disable +@pytest.mark.parametrize( + ("mm_processor_kwargs", "expected_toks_per_img"), + [ + ({"size": 
{"longest_edge": 364}}, 169), + ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)), + ]) +# yapf: enable +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override(image_assets: _ImageAssets, model: str, + mm_processor_kwargs: dict[str, object], + expected_toks_per_img: int, num_imgs: int): """Ensure input_processor_for_idefics3 handles num_crops properly.""" # Same as the previous test - don't initialize mm_processor_kwargs # in this test and assume that the kwargs will be correctly expanded by # the partial when calling the custom input processor. - size = {"longest_edge": longest_edge} if longest_edge is not None else None ctx = build_model_context( model_name=model, tokenizer_name=model, trust_remote_code=True, mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, ) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, + ) + hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) # Build the image str / prompt based on the number of images we pass - tokenizer = AutoTokenizer.from_pretrained(model) placeholders = "" if num_imgs == 1 else "\n".join( f"Image-{i}: \n" for i in range(1, num_imgs + 1)) prompt = f"<|begin_of_text|>User:{placeholders}\n\nAssistant:" # noqa: E501 - images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs - - inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), - prompt=prompt, - multi_modal_data={"image": images}) - processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size) + # Build mm_data + image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size + dummy_image_size = (image_size * 4, image_size * 4) + dummy_image = image_assets[0].pil_image.resize(dummy_image_size) + mm_data = {"image": [dummy_image] * num_imgs} + + processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) + # Ensure the placeholders format are 
correct + hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"]) + assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[ + "input_ids"][0] # Ensure we have the right number of placeholders per num_crops size image_token_id = ctx.get_hf_config().image_token_id diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index 81d6e527498694b949c50afed219dcb799ef7bf2..f0fd8383b31318d91bf15613b5ba7890ed357476 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -1,208 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for InternVL's multimodal preprocessing kwargs.""" -from typing import Callable, Optional +from typing import Optional import os import pytest -from transformers import AutoTokenizer -from vllm.inputs import InputContext, token_inputs -from vllm.multimodal import MultiModalRegistry +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import cached_get_tokenizer from ....conftest import _ImageAssets from ...utils import build_model_context from ....utils import models_path_prefix -models = [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")] - -# Wrap lazy imports to avoid initializing CUDA during test collection -@pytest.fixture() -def input_processor_for_internvl(): - from vllm.model_executor.models.internvl import InternVLInputPipeline - - pipeline = InternVLInputPipeline('', '', '') - return pipeline.input_processor - - -@pytest.fixture() -def dummy_data_for_internvl(): - from vllm.model_executor.models.internvl import InternVLInputPipeline - - pipeline = InternVLInputPipeline('', '', '') - return pipeline.dummy_data - - -@pytest.fixture() -def get_max_internvl_image_tokens(): - from vllm.model_executor.models.internvl import ( - get_max_internvl_image_tokens) - return get_max_internvl_image_tokens - - -@pytest.mark.parametrize("model", models) 
+@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")]) @pytest.mark.parametrize("max_dynamic_patch", [1, 4]) @pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -def test_input_mapper_override( - model: str, +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_override( + model_id: str, image_assets: _ImageAssets, max_dynamic_patch: int, dynamic_image_size: Optional[bool], -): - mm_processor_kwargs = { - "max_dynamic_patch": max_dynamic_patch, - } - if dynamic_image_size is not None: - mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size - - expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 - if dynamic_image_size is False: - expected_num_patches = 1 - - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=mm_processor_kwargs, - ) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - image = image_assets[0].pil_image.resize((448 * 2, 448 * 2)) - vllm_result = mm_registry.map_input( - ctx.model_config, - {"image": image}, - ) - assert vllm_result["pixel_values"].size(1) == expected_num_patches - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None]) -@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -def test_max_tokens_override( - get_max_internvl_image_tokens: Callable, - model: str, - max_dynamic_patch: Optional[int], - dynamic_image_size: Optional[bool], -): - """Ensure get_max_internvl_image_tokens handles mm_processor_kwargs.""" - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - if max_dynamic_patch is None: - max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch - expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 - if dynamic_image_size is False: - 
expected_num_patches = 1 - expected_max_tokens = 256 * expected_num_patches - - actual_max_tokens = get_max_internvl_image_tokens( - ctx=InputContext(ctx.model_config), - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - ) - assert expected_max_tokens == actual_max_tokens - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("num_imgs", [1, 2]) -@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None]) -@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -def test_dummy_data_override( - dummy_data_for_internvl: Callable, - model: str, num_imgs: int, - max_dynamic_patch: Optional[int], - dynamic_image_size: Optional[bool], ): - """Ensure dummy_data_for_internvl handles kwargs properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the dummy data func. ctx = build_model_context( - model_name=model, - tokenizer_name=model, + model_name=model_id, + tokenizer_name=model_id, trust_remote_code=True, mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, ) - - if max_dynamic_patch is None: - max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch - expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 - if dynamic_image_size is False: - expected_num_patches = 1 - expected_max_tokens = 256 * expected_num_patches - - dummy_data = dummy_data_for_internvl( - ctx=ctx, - seq_len=8192, # Should be bigger than num_imgs * toks_per_img - mm_counts={"image": num_imgs}, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=tokenizer, ) - sequence_data = dummy_data.seq_data - - tokenizer = 
AutoTokenizer.from_pretrained(model, trust_remote_code=True) - image_token_id = tokenizer.encode('', - add_special_tokens=False)[0] - # Ensure we have the right number of placeholders per size - img_tok_count = sequence_data.get_token_ids().count(image_token_id) - assert img_tok_count == expected_max_tokens * num_imgs + mm_processor_kwargs = { + "max_dynamic_patch": max_dynamic_patch, + } + if dynamic_image_size is not None: + mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size + # Build the image str / prompt based on the number of images we pass + prompt = "" * num_imgs + image = image_assets[0].pil_image.resize((448 * 2, 448 * 2)) + mm_data = {"image": [image] * num_imgs} -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("max_dynamic_patch", [1, 4]) -@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) -@pytest.mark.parametrize("num_imgs", [1, 2]) -def test_input_processor_override( - input_processor_for_internvl: Callable, - image_assets: _ImageAssets, - model: str, - num_imgs: int, - max_dynamic_patch: int, - dynamic_image_size: Optional[bool], -): - """Ensure input_processor_for_internvl handles kwargs properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the custom input processor. 
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 if dynamic_image_size is False: expected_num_patches = 1 - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - expected_toks_per_img = 256 * expected_num_patches - - # Build the image str / prompt based on the number of images we pass - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) - placeholders = "" if num_imgs == 1 else "\n".join( - f"Image-{i}: \n" for i in range(1, num_imgs + 1)) - prompt = placeholders - images = [image_assets[0].pil_image.resize((448 * 2, 448 * 2))] * num_imgs - - inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), - prompt=prompt, - multi_modal_data={"image": images}) - - processed_inputs = input_processor_for_internvl( - ctx, - inputs, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, - ) + processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size - image_token_id = tokenizer.encode('', - add_special_tokens=False)[0] + image_token_id = tokenizer.convert_tokens_to_ids("") img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) - assert img_tok_count == expected_toks_per_img * num_imgs + pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape + + assert img_tok_count == 256 * expected_num_patches * num_imgs + assert pixel_shape[0] == expected_num_patches * num_imgs diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index 6de649f87204d44923f797f08d968fb6f4260889..fe4754c2ef6f6f07b62abae43bd76093dd5774ba 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from functools import partial @@ 
-41,7 +43,10 @@ def test_processor_max_tokens(model_id): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) info = processor.info @@ -141,7 +146,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), @@ -171,7 +179,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) seen_aspect_ratios = set[float]() diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index 806437d35ec8731dbabc42f28021d848f37e6aed..fb650d9e0995fe9b41ffe779be4bc538f7e7e478 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from functools import partial @@ -42,7 +44,10 @@ def test_processor_max_tokens(model_id): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) info = processor.info @@ -141,7 +146,10 @@ def 
test_processor_prompt_replacements_regression(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), @@ -172,7 +180,10 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, - tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + tokenizer=cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ), ) seen_aspect_ratios = set[float]() diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index 7f82a8f18f0ca97a72b588d26de722f112e86011..dde8904f2ef65a9fe7ddc95c50f4606f60d46262 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for phi3v's multimodal preprocessing kwargs.""" import pytest @@ -37,7 +38,10 @@ def test_processor_override( trust_remote_code=True, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index de14fbbffe5b73e2e5aa79f5b33d28123b45d970..ef8e97f82d0bc38ce668836aaeab9055c12847c6 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + import pytest from vllm.multimodal import MULTIMODAL_REGISTRY @@ -31,7 +33,10 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + trust_remote_code=ctx.model_config.trust_remote_code, + ) processor = MULTIMODAL_REGISTRY.create_processor( ctx.model_config, tokenizer=tokenizer, diff --git a/tests/models/registry.py b/tests/models/registry.py index 7952e65aa76a5d03d0275b82a1ebc7b4db508422..20787fe008aa891b42e7543c16fee975bf447ae2 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass, field from typing import AbstractSet, Any, Literal, Mapping, Optional @@ -222,8 +224,7 @@ _CROSS_ENCODER_EXAMPLE_MODELS = { _MULTIMODAL_EXAMPLE_MODELS = { # [Decoder-only] - "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", - min_transformers_version="4.48"), + "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", @@ -263,6 +264,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 + min_transformers_version="4.49"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", trust_remote_code=True), # [Encoder-decoder] @@ -276,7 +279,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "MedusaModel": 
_HfExamplesInfo("JackFram/llama-68m", speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", - speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501 + speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501 +} + +_FALLBACK_MODEL = { + "TransformersModel": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 } _EXAMPLE_MODELS = { @@ -285,6 +292,7 @@ _EXAMPLE_MODELS = { **_CROSS_ENCODER_EXAMPLE_MODELS, **_MULTIMODAL_EXAMPLE_MODELS, **_SPECULATIVE_DECODING_EXAMPLE_MODELS, + **_FALLBACK_MODEL, } diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index d3a3aaf670c23b6d8650b041c2bac3dca25edc45..64928a65d856551f2995acd2095373a1ef765adb 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import patch import pytest diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 2c413a633896aa90e482280184fc28a81bc9eb47..f2a505596ce6923b136b0dc5d3a50a87623aa436 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest @@ -13,7 +15,9 @@ def test_plugin(dummy_opt_path): os.environ["VLLM_PLUGINS"] = "" with pytest.raises(Exception) as excinfo: LLM(model=dummy_opt_path, load_format="dummy") - assert "are not supported for now" in str(excinfo.value) + error_msg = "has no vLLM implementation and " \ + "the Transformers implementation is not compatible with vLLM." 
+ assert (error_msg in str(excinfo.value)) @fork_new_process_for_each_test diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index ac0366847e3345351c8635895994c0ded3a18675..80d3f78f9f31734e60c9fe77c727923e6d00da9a 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings import pytest diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..1d5d9729df85bb5af8113abf491da292b0e422b7 --- /dev/null +++ b/tests/models/test_transformers.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Test the functionality of the Transformers backend. + +Run `pytest tests/models/test_transformers.py`. +""" +from contextlib import nullcontext +from typing import Type + +import pytest + +from ..conftest import HfRunner, VllmRunner +from ..utils import multi_gpu_test +from .utils import check_logprobs_close + + +def check_implementation( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + example_prompts: list[str], + model: str, + **kwargs, +): + max_tokens = 32 + num_logprobs = 5 + + with vllm_runner(model, **kwargs) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + with hf_runner(model) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize( + "model,model_impl", + [ + ("meta-llama/Llama-3.2-1B-Instruct", "transformers"), + ("openai-community/gpt2", "transformers"), + ("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE + ("meta-llama/Llama-3.2-1B-Instruct", "auto"), + ]) # trust_remote_code=True by default +def test_models(hf_runner, vllm_runner, example_prompts, model, + model_impl) 
-> None: + + maybe_raises = nullcontext() + if model == "openai-community/gpt2" and model_impl == "transformers": + # Model is not backend compatible + maybe_raises = pytest.raises( + ValueError, + match="The Transformers implementation.*not compatible with vLLM") + + with maybe_raises: + check_implementation(hf_runner, + vllm_runner, + example_prompts, + model, + model_impl=model_impl) + + +@multi_gpu_test(num_gpus=2) +def test_distributed( + hf_runner, + vllm_runner, + example_prompts, +): + kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2} + check_implementation(hf_runner, vllm_runner, example_prompts, + "meta-llama/Llama-3.2-1B-Instruct", **kwargs) diff --git a/tests/models/utils.py b/tests/models/utils.py index 0eb3f61f1f047937850f5c2dfc02e81bf778b62d..e2be43c1266714393bb426eec733746b03b577e4 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings from typing import Dict, List, Optional, Sequence, Tuple, Union diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py index 59f9df470595ec79af3f06563534dd6b09c4e973..7bd9d12bd7b33684c03465c8fa00bc8b3c682de0 100644 --- a/tests/mq_llm_engine/test_abort.py +++ b/tests/mq_llm_engine/test_abort.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that aborting is handled properly.""" import asyncio diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 7801e92f2ae3e6201722efe5a56f40fcab5f7629..f488a67a51af2f68bc4490141de03549a6045408 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that various errors are handled properly.""" import asyncio diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index 60252e448d8aadbe3bb17ed0901307c73d983066..45be14f9ff259b8cc4a36d5b2165a15445493a88 100644 --- 
a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that the MQLLMEngine is able to handle 10k concurrent requests.""" import asyncio diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py index f717c1355431ccf75637351d4e3d2a39dfd199eb..11e44f12bc56f53a48358afe96e8256997de877e 100644 --- a/tests/mq_llm_engine/utils.py +++ b/tests/mq_llm_engine/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import multiprocessing from typing import Callable, Tuple, Union diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 288abe2b6d6a63bc5f6161613ab08d993c2982fd..9be6ff64406eb2da8a2437e24fb3d46e7f01479c 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Test the AsyncLLMEngine with multi-step-decoding from typing import List, Optional diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index 0e347f629faef2cd83fb4c20feb28e2cc91a852c..2d57af8fcceecda20e4fc83ec3c53d5c78e54c72 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Test the LLMEngine with multi-step-decoding import copy diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py index 678bbb52b8c2f8b90c7a1a6023eafa01daa5e824..f5d3e282f953d602a9b643776f2853b4b6918a35 100644 --- a/tests/multimodal/test_inputs.py +++ b/tests/multimodal/test_inputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 
13f820d013e2aa6683dcf36858507ff49ffec57b..459c0d9d113f223c47f4903ed7a396a646ee424f 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,9 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 + from contextlib import nullcontext +from types import MethodType from typing import cast from unittest.mock import MagicMock import numpy as np import pytest +from transformers import ProcessorMixin from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -634,3 +638,70 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): mm_data=mm_data, hf_processor_mm_kwargs={}, ) + + +class _ProcessorProxy: + + def __init__(self, processor: ProcessorMixin) -> None: + super().__init__() + + self.__processor = processor + + def __getattr__(self, key: str): + return getattr(self.__processor, key) + + def __call__( + self, + text=None, + images=None, + videos=None, + exists=None, + return_tensors=None, + ): + return dict(exists=exists) + + +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-7B-Instruct"]) # Dummy +# yapf: disable +@pytest.mark.parametrize( + ("call_kwargs", "expected_kwargs"), + [ + # Should ignore invalid kwargs + ({"does_not_exist": 100}, {"exists": None}), + ({"exists": 1}, {"exists": 1}), + ({"does_not_exist": 100, "exists": 1}, {"exists": 1}), + ], +) +# yapf: enable +def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): + model_config = ModelConfig( + model=model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="half", + revision=None, + ) + + processor = MULTIMODAL_REGISTRY.create_processor( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + orig_get_hf_processor = processor.info.get_hf_processor + + def get_hf_processor(self, **kwargs): + assert kwargs == call_kwargs + return _ProcessorProxy(orig_get_hf_processor()) + + processor.info.get_hf_processor = 
MethodType(get_hf_processor, + processor.info) + + out_kwargs = processor._call_hf_processor( + prompt="", + mm_data={}, + mm_kwargs=call_kwargs, + ) + + assert out_kwargs == expected_kwargs diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 531ed1d0cc994a8e41522dd71d2894200e1290ae..8088a41d591fecdcdc1c1323f41cda47d44e35c0 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 import mimetypes import os diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py index 29aeca605109b3c02d9df5eba30be519227b2e19..9a336b7e60ffc5c8b2ca5f1aac71cac3867f2e16 100644 --- a/tests/multimodal/utils.py +++ b/tests/multimodal/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import numpy as np from PIL import Image diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py index 77b707a73711871d2e73dd8ed3fd1607dc1d11b0..dfbcfc15e2327484e83cc564fa53c8e71cdac1c3 100644 --- a/tests/neuron/test_prefix_prefill.py +++ b/tests/neuron/test_prefix_prefill.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_model/setup.py b/tests/plugins/vllm_add_dummy_model/setup.py index 9b535127f1df14ef998e411ae5b8080de25a81ff..e3fb6efb275761e2473cb123416a421f859eb605 100644 --- a/tests/plugins/vllm_add_dummy_model/setup.py +++ b/tests/plugins/vllm_add_dummy_model/setup.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from setuptools import setup setup(name='vllm_add_dummy_model', diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 62a8f871fa51b7d0e99f287b6aabfeef74be6c70..0c431cb39737b1325ab72621afe5ce9d92f88eac 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ 
b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import ModelRegistry diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index 5e7d7d1877e61629f36984e4ef77d4defd5236e7..3af62b2885e5720e51d48be23aa73ce5a4854181 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Tuple, Union import torch diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index ac64edfd4ec9d95782a7a073c67dc0f4ad2c9ac8..c23ab64308f204483dd8805296e8c5335f91838b 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import torch diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py index 569ef216c9f0a1f697bf433b359bd5cf75be4886..bbd11ed4aac9d816bd56949906a67f06ee1ad53b 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import torch diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py index 31639906898db5eda1663bcff3ff5d3824449fff..10df0b5e050350bf21c26d543535308cecc968f6 100644 --- a/tests/plugins/vllm_add_dummy_platform/setup.py +++ 
b/tests/plugins/vllm_add_dummy_platform/setup.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from setuptools import setup setup( diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py index 594cef520a7ded350b2a86442073cbd58709d630..0d1b062ac2eb549d6fe010a9a444792b45e6b7fd 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py index 5634be3c8d882f7ee59e1f78127897927f6cfca5..33425bbc11ed912900118774cdb6c151df91f823 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.attention.backends.flash_attn import FlashAttentionBackend diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index d7c6bdd707eb70746481eacf04348d813294e187..5cefafc7e06c7d51d4a3fe60042685174f29b508 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.platforms.cuda import CudaPlatform diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 661aa5f649ab9169ca8d24277108ce2a5d18e3d4..ed50fe53501490195989b4a76e85f32c0267f322 100644 
--- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from tests.kernels.utils import override_backend_env_variable diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index ee952f6f618966bb8670157eff6275fcef0c99a5..efaa25f8b12d1ba72aaba4a9b0d11278c450a57c 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the with and without prefix caching. Run `pytest tests/prefix_caching/test_prefix_caching.py`. diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 0b9d3db086c94ebc2171772078c939039a39abd7..bc21367e9d522bac3ac9fdcdc082416ae30f40d2 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the with and without prefix caching. Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py index 683de76282e37ab4d33d0de08e0c9679b5131b4a..5e9b079dde9b053964f5b8b5873cb8ba0bfc6c7a 100644 --- a/tests/prompt_adapter/test_bloom.py +++ b/tests/prompt_adapter/test_bloom.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py index 1c8b47548e8bfb5cae7e1b3e27e8cf88f47360d7..6f900513dee42bc484f82e99c262ba95271c407d 100644 --- a/tests/prompt_adapter/test_multi_adapter_inference.py +++ b/tests/prompt_adapter/test_multi_adapter_inference.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import EngineArgs, LLMEngine, SamplingParams from vllm.prompt_adapter.request import PromptAdapterRequest from ..utils import models_path_prefix diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py index 75460f072fe7c8edb469c9db8da3a8cdd8c96ead..1c9710ccd378e9e3a3fc42d9440b7705e6774a57 100644 --- a/tests/prompt_adapter/test_pa_lora.py +++ b/tests/prompt_adapter/test_pa_lora.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from huggingface_hub import snapshot_download from vllm import EngineArgs, LLMEngine, SamplingParams diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 9462e0ca601ef4d38d4510bc4c67260f72857e24..a5f5c62533e6f8f13ae60cf4d3f60bd4384a53fc 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 '''Tests whether bitsandbytes computation is enabled correctly. Run `pytest tests/quantization/test_bitsandbytes.py`. 
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 15146b5c05dca47c4c7979dff649078a08f7032f..2e813c8a87a8556fdbd066355b0f3f25df1518c8 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -1,7 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 """Test model set-up and weight loading for llmcompressor-quantized models. Run `pytest tests/quantization/test_compressed_tensors.py`. """ + from typing import Optional import pytest @@ -24,12 +26,30 @@ from ..utils import models_path_prefix @pytest.mark.parametrize( "model_args", - [(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), "tensor", - QuantizationType.INT, 2560, True), - (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"), "channel", - QuantizationType.INT, 2560, True), - (os.path.join(models_path_prefix, "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"), "tensor", - QuantizationType.INT, 2560, False)]) + [ + ( + os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), + "tensor", + QuantizationType.INT, + 2560, + True, + ), + ( + os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"), + "channel", + QuantizationType.INT, + 2560, + True, + ), + ( + os.path.join(models_path_prefix, "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"), + "tensor", + QuantizationType.INT, + 2560, + False, + ), + ], +) def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: @@ -87,21 +107,31 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): assert output -@pytest.mark.parametrize("model_path", [ - os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"), - 
os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"), - os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym"), - os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym") -]) +@pytest.mark.parametrize( + "model_path", + [ + os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"), + os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"), + os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym"), + os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"), + ], +) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner, - example_prompts, model_path, - max_tokens, num_logprobs): +def test_compressed_tensors_w8a8_logprobs( + hf_runner, + vllm_runner, + example_prompts, + model_path, + max_tokens, + num_logprobs, +): dtype = "bfloat16" # skip language translation prompt for the static per tensor asym model - if model_path == os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"): # noqa: E501 + if (model_path == + os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym") + ): # noqa: E501 example_prompts = example_prompts[0:-1] with hf_runner(model_path, dtype=dtype) as hf_model: @@ -127,13 +157,21 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): assert output -@pytest.mark.parametrize("model_args", [ - (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"), "tensor"), - (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"), "tensor"), - (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"), 
"channel"), - (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"), - "channel"), -]) +@pytest.mark.parametrize( + "model_args", + [ + (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"), "tensor"), + (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"), "tensor"), + ( + os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"), + "channel", + ), + ( + os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"), + "channel", + ), + ], +) def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: @@ -160,9 +198,12 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): reason="WNA16 is not supported on ROCm.") @pytest.mark.parametrize( "wNa16_args", - [(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-channel-v2"), "channel", None, 8), + [ + (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-channel-v2"), "channel", None, 8), (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-group128-v2"), "group", 128, 8), - (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4)]) + (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4), + ], +) def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: @@ -226,7 +267,8 @@ def test_compressed_tensors_fp8(vllm_runner): CompressedTensorsLinearMethod) assert isinstance( qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8), + ) assert qkv_proj.input_scale.dtype is torch.float32 @@ -251,9 +293,14 @@ 
def test_compressed_tensors_kv_cache(vllm_runner): assert output -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +def _test_2of4_quant_models(qkv_proj, + weight_strategy, + input_strategy, + format="dense"): assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.scheme, CompressedTensors24) @@ -262,22 +309,39 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): assert qkv_proj.scheme.quantized assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").format == format assert sparsity_map.get("Linear").sparsity_structure == "2:4" -@pytest.mark.skipif(not current_platform.has_device_capability(90), - reason="Sparse FP8 is not yet supported on this GPU type.") -@pytest.mark.parametrize("args_2of4", [ - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", "channel", - "token"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", - "channel", "tensor"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", "tensor", - "tensor"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", - "tensor", "token"), -]) +@pytest.mark.skipif( + not current_platform.has_device_capability(90), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", + "channel", + "token", + ), + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", + "channel", + "tensor", + ), + ( + 
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", + "tensor", + "tensor", + ), + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", + "tensor", + "token", + ), + ], +) def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: @@ -296,16 +360,134 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): assert output -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -@pytest.mark.parametrize("args_2of4", [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", - "channel", "token"), - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", "tensor", - "tensor"), - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", - "tensor", "token"), -]) +@pytest.mark.skipif( + not current_platform.has_device_capability(90), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM", + "channel", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM", + "channel", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM", + "tensor", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM", + "tensor", + "tensor", + ), + ], +) +def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4): + model, weight_strategy, input_strategy = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models( + qkv_proj, + weight_strategy, 
+ input_strategy, + format="sparse-24-bitmask", + ) + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="cutlass is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM", + "channel", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM", + "channel", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM", + "tensor", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM", + "tensor", + "tensor", + ), + ], +) +def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4): + model, weight_strategy, input_strategy = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models( + qkv_proj, + weight_strategy, + input_strategy, + format="sparse-24-bitmask", + ) + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", + "channel", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", + "tensor", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", + "tensor", + "token", + ), + ], +) def test_compressed_tensors_2of4_quant_int8(vllm_runner, 
args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: @@ -327,10 +509,12 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") @pytest.mark.skipif( not sparse_cutlass_supported(), - reason="2of4 Sparse is not yet supported on this GPU type.") + reason="2of4 Sparse is not yet supported on this GPU type.", +) @pytest.mark.parametrize( "args_2of4", - [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) + [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")], +) def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: @@ -347,7 +531,9 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): assert qkv_proj.scheme.input_quant is None assert not qkv_proj.scheme.quantized assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + sparsity_map = ( + qkv_proj.quant_method.quantization_config.sparsity_scheme_map + ) # noqa: E501 assert sparsity_map.get("Linear").format == "dense" assert sparsity_map.get("Linear").sparsity_structure == "2:4" @@ -356,3 +542,38 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Cutlass is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")]) +def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): + model = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + 
assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = ( + qkv_proj.quant_method.quantization_config.sparsity_scheme_map + ) # noqa: E501 + assert sparsity_map.get("Linear").format == "sparse-24-bitmask" + assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 99a5561f11c290b336921407fb696e7cce581cf7..baf8397eae551306b303384b44aa76cf1c8f8834 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests whether Marlin models can be loaded from the autogptq config. Run `pytest tests/quantization/test_configs.py --forked`. 
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index 8a20e668c542bcfb050f1b4109d6c1fd2fb95147..f6bd1a04bc37c284faaef088e424d11e2e65da26 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Expanded quantized model tests for CPU offloading # Base tests: tests/basic_correctness/test_cpu_offload.py diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index 3e7c60d1144e5a74235a6544ead1536e81739f92..11d170554fb1c46ba86b6e8406e131398a78fcb9 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # flake8: noqa """Tests experts_int8 quantization startup and generation, doesn't test correctness diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 8ec2643a40c82d938984cbfb542c39543069e52c..eff4b1049ef8e4ef1076c89aacc780918d202833 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests whether FP8 computation is enabled correctly. Run `pytest tests/quantization/test_fp8.py --forked`. diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py index 2179186a3bcdf7c5a9210dcd7a995d9d40d69b18..3b88db3921ffc755f2444f2f821e6e102d1d921b 100644 --- a/tests/quantization/test_ipex_quant.py +++ b/tests/quantization/test_ipex_quant.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test model set-up and inference for quantized HF models supported on the CPU/GPU backend using IPEX (including AWQ/GPTQ). 
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 9b7c2fd0fcb2379e71b1548dfbeefd5bc126f727..e123faa48223cd161b43abfcf2dfbe27fddf45d9 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests whether gptq models with quantized lm_head can be loaded. Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 11382ad708faa89da1447ce4c613ab5dc0f02ac6..491370c7cc24d243586f8dd76d8ca3e40b4b0996 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test model set-up and weight loading for quark-quantized models. Run `pytest tests/quantization/test_quark.py`. diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 8e7f44a399ddf547b0d5da8b3752508f1660a16e..9e1867f913e98c1708163063aee2706c006bf725 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests register custom quantization config. See https://github.com/vllm-project/vllm/issues/11926 for more details. 
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 8ebd8dd2be0d5b5aad974381b3e0e2dbd1f6db83..7a339c162cc48a45a8cde13c8c5f77f870169628 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.model_executor.layers.quantization import get_quantization_config from vllm.platforms import current_platform diff --git a/tests/runai_model_streamer/test_runai_model_streamer_loader.py b/tests/runai_model_streamer/test_runai_model_streamer_loader.py index c5722fbae5c8a0780587289d6078e8c689904368..aa91fa8e1c1c68264625d73ac2fee698e9724f9d 100644 --- a/tests/runai_model_streamer/test_runai_model_streamer_loader.py +++ b/tests/runai_model_streamer/test_runai_model_streamer_loader.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import SamplingParams from vllm.config import LoadConfig, LoadFormat from vllm.model_executor.model_loader.loader import (RunaiModelStreamerLoader, diff --git a/tests/runai_model_streamer/test_weight_utils.py b/tests/runai_model_streamer/test_weight_utils.py index 5c89bd78ad81dd9cd8ed7fd14948e534aeeaba81..4afa76c51693fc44c3f2fc192454c421433658a9 100644 --- a/tests/runai_model_streamer/test_weight_utils.py +++ b/tests/runai_model_streamer/test_weight_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import glob import tempfile diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index e43f6683f305c1c5e3798c432aa611c6e1bff2d5..1664dbe3cf0778642669b271e7338a1c85646869 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the outputs of HF and vLLM when using beam search. Run `pytest tests/samplers/test_beam_search.py`. 
diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 95d53cae517613ea04d59954bfbf52c0f925ad18..22487c54ba09c63f70f2826b0fee76ec065cdc93 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Make sure ignore_eos works. Run `pytest tests/samplers/test_ignore_eos.py`. diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index cb458def8ff71bdf237f504c46b2cb58264894ce..1a722ac0845f3d94cd0fe7c6efd666ef853466f0 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch import os diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 5b2e6584fa1eb8082c75cc1c90d4239afee80f05..9e9d81692acae66f9d6d6672bbeb8b7839631233 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 4f9511ef571132ab5cae005e7e71d8595bb99d49..b00fec2dbaeb42951e848bdd917caab1eeb02cc9 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Make sure bad_words works. Run `pytest tests/samplers/test_no_bad_words.py`. 
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index dde83b69f6c1aa8dbd09e39648d6bc9b2054152c..0361a6af91ac997c62d644e0824432e31a574e00 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 4e25beeba450a0d0f4973639bdd96c0bc410ecad..1e695e0190be34a54079fca73029fc2e596b7cf4 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for rejection sampling.""" from typing import List, Tuple diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 28c34064f670c4777b2416876503d32a7c2a9779..ca09e536a06ca57d431546e27d7ee04b24504a28 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools import random from dataclasses import dataclass diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index d380f340b679382ab4211a246c46ec032338b6f2..e1c40dc10e1dbf3632ee2c31f9adbf5b685add79 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Verify that seeded random sampling is deterministic. Run `pytest tests/samplers/test_seeded_generate.py`. 
diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index 4ddad66dce1fb203df546f065a60aeb009658d16..ecf98179ca21a88ed8421e26e58ebd2c572590f4 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for rejection sampling.""" import pytest diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 5cb982a0811c7e2212a244b68173056cc8d34184..53c888816a6c1df909c7331a557521abe427501d 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import cycle from typing import List, Optional, Sequence, Tuple, Union diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index 4ef6f4c693e5c1e0e79f55587e8b1df53534fa1e..82d4413b66a09d99dccc5370e6362f81fc93fea5 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py index e4d2f4f5657b2b6780a986b0a0b99e00987dc76d..de6510c93021930ad4b8d3328ae2e8ee41e2b1aa 100644 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ b/tests/spec_decode/e2e/test_eagle_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This docstring details important information on the testing methodology. 
Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py index 7c6ea1a307d002c5e2e8c423244ea106669b9d22..5c5bd4ad2d455469f35fd21e5de981e248f6fc69 100644 --- a/tests/spec_decode/e2e/test_integration.py +++ b/tests/spec_decode/e2e/test_integration.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests which cover integration of the speculative decoding framework with other features, e.g. cuda graphs. """ diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 8683ae032b2dab1223af1f6883e76bb9038428c1..4839ab10933321be84d089b0e38114deae7516eb 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests which cover integration of the speculative decoding framework with tensor parallelism. """ diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index 8f4809512dc26e5c6c86b2cdaebe0c51be608ce3..871d8f26b54bb0015f8277038c14aceb0b6c6d64 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests which cover integration of the speculative decoding framework with tensor parallelism. 
""" diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index ab8b651ae76d0e9577b76e3f4511016efcd6c8e2..dfca7afc503bca561a5a0486038b3759d009376f 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import cycle import pytest diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index 8ab9132baa149ab6c8066ca9e2a07c6a41d2007d..4b7380801faec74dfb55f7cfa613aa59cfc48fe8 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 4ea4ebd10d750f6b3589c2c980b063e138d92c8f..91582cbc639b1bf0b5287f877e5cf80cb2d63153 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index e5aff6fa121f84672e284a1496b4d9a8eb84506d..2cc73669591a45af176bf903816df2d5b5413ed0 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """The tests in this file verify end-to-end speculative decoding correctness. This docstring details important information on the testing methodology. 
diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 4bfe4f2f327b1cc2bfc7bc7ffe3b21a33bbd2904..61cec58c232886fd59662f086011288ec22aad29 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py index 7be76f97554122d832be3c7acaecb27343345d7d..01a3210e5a74bc5f2f1607556a644c33ba23c6fa 100644 --- a/tests/spec_decode/e2e/test_seed.py +++ b/tests/spec_decode/e2e/test_seed.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 3504fcf43e36104c675eb63b32e80930fb1bfe69..fe95ff9b9c35a8f05d18075dfc1ada5914aedbf3 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py index aa49a3aee62aa6aca65dc5be4ed660a764066e14..0bff0ea1d7dba513215b774bfc90057c66415e6d 100644 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ b/tests/spec_decode/test_dynamic_spec_decode.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock, patch import pytest diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 7477486a3388d7aa01727c5fc4a5c2ed249f2776..1a6693e168173b9ea46fb274cb2b3d25c1885e91 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math 
from unittest.mock import MagicMock diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index cb4f06c482de934e99488d4e163666788dfb37e4..8d7bb7be27cba82d13aa8e761db21503635bc4a1 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import Dict, List from unittest.mock import MagicMock diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index 65b4acb27df790a34616a7f92b92c393dec2d679..bd6269af6fa917c19498fc6fdddf4ceceb6c6249 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import os diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 70827344c712966402cf1ad6d022e4a29b4aaf01..97d45ea5cf94b08f76481ea32adb88f21ed3c34e 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import List diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index d8c3af4c1cd1ef744f69a371d08ad00d2440a425..eee0f4c89c8987e60228b9a01fad397f2acbffa2 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from collections import defaultdict from types import SimpleNamespace diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 195fce64822bd2154afb11ea2a5a8f1d844297db..24573e22487d01d8b78d1d4541d27df2dac935aa 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from unittest.mock import MagicMock import pytest diff --git 
a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index 2f883c2ff9b7aa6572c47ab84bed9c8a04acaee9..38f57e99bdb0df9e90f55d6c9664c9cf4010da49 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from itertools import count from typing import Callable, Dict, List, Optional from typing import Sequence as GenericSequence diff --git a/tests/standalone_tests/lazy_torch_compile.py b/tests/standalone_tests/lazy_imports.py similarity index 53% rename from tests/standalone_tests/lazy_torch_compile.py rename to tests/standalone_tests/lazy_imports.py index b950877a4337b24597d9a636056291b0090edfe7..61e3b387973bc66240579f529cf57becc453e30a 100644 --- a/tests/standalone_tests/lazy_torch_compile.py +++ b/tests/standalone_tests/lazy_imports.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Description: Test the lazy import module # The utility function cannot be placed in `vllm.utils` # this needs to be a standalone script @@ -6,7 +8,17 @@ from contextlib import nullcontext from vllm_test_utils import BlameResult, blame -module_name = "torch._inductor.async_compile" +# List of modules that should not be imported too early. +# Lazy import `torch._inductor.async_compile` to avoid creating +# too many processes before we set the number of compiler threads. +# Lazy import `cv2` to avoid bothering users who only use text models. +# `cv2` can easily mess up the environment. +module_names = ["torch._inductor.async_compile", "cv2"] + + +def any_module_imported(): + return any(module_name in sys.modules for module_name in module_names) + # In CI, we only check finally if the module is imported. # If it is indeed imported, we can rerun the test with `use_blame=True`, @@ -14,8 +26,7 @@ module_name = "torch._inductor.async_compile" # and help find the root cause. # We don't run it in CI by default because it is slow. 
use_blame = False -context = blame( - lambda: module_name in sys.modules) if use_blame else nullcontext() +context = blame(any_module_imported) if use_blame else nullcontext() with context as result: import vllm # noqa @@ -23,6 +34,6 @@ if use_blame: assert isinstance(result, BlameResult) print(f"the first import location is:\n{result.trace_stack}") -assert module_name not in sys.modules, ( - f"Module {module_name} is imported. To see the first" +assert not any_module_imported(), ( + f"Some the modules in {module_names} are imported. To see the first" f" import location, run the test with `use_blame=True`.") diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index 2a456536224480c23926ca5c8c6c01d35c02dd2d..694bb5fbc3f7172e31f1439cfd625b2397333cb7 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools import gc from typing import Callable, TypeVar diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 62137140d76b7a313d6259c9419098ca6e677f51..58f0ba0a947c0e83aff946386b6e7326e204bb86 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import gc import json import os diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 09ca27e5af28d15f62a9d6e813b5b59f915d3a9a..c11a0ce3962bbb60aa7a00e700445352c498bcc0 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test hashing of cache blocks. Run `pytest tests/test_cache_block_hashing.py`. 
diff --git a/tests/test_config.py b/tests/test_config.py index 39f82f0773dfaa6505d1ad7f25b4735187d19104..f692569f075eec30cee52a95300eca1003e622b8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import asdict import pytest diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py index ffeacf34b7baf02d876e5d5e79db823be575fc31..a9b4f5cbf78c3a68d229bbf9a8a57a123a5366ec 100644 --- a/tests/test_embedded_commit.py +++ b/tests/test_embedded_commit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import vllm diff --git a/tests/test_inputs.py b/tests/test_inputs.py index fff7c5fc04285dbd7a72f6d540a6b8eb881293a7..fff909154a2aed5e21807b4d55abda6d9ed8ce79 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest diff --git a/tests/test_logger.py b/tests/test_logger.py index e3749616d420312266f5c8a1dcad8fc3239a4db6..993822e9224057e2177749e24adf1b424c693a91 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import logging import os diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index 39c1c38151fd0a74d28734ac8e2c3e6e19a0e1cd..487fbb8fcb8c8b36935c89b251551629c6d20199 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import random from typing import Tuple from unittest.mock import patch diff --git a/tests/test_regression.py b/tests/test_regression.py index b743bcc155aa433dd898c2aed870a0e170655e73..879f437a3e85126eb06027dac35ec73f8d73b0a6 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Containing tests that check for regressions in vLLM's behavior. 
It should include tests that are reported by users and making sure they diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py index 01cbe0c997f298a9cf86d342244f9122e38cdcc2..40e26ed5199c1c0d7faa72ca90f6b682a876b91f 100644 --- a/tests/test_sampling_params.py +++ b/tests/test_sampling_params.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests for the SamplingParams class. """ from vllm import SamplingParams diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py index a9221f08c294645dc090c76cdf19490826da0088..6e36f2c337f325543ba097eab94bdc7324662b87 100644 --- a/tests/test_scalartype.py +++ b/tests/test_scalartype.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import torch diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 30e53a180ea310c4dddc2133baaee467040317be..902de1099e6051bc56b4775893ffa52e6b224f93 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.model_executor.layers.sampler import SamplerOutput diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 69f3eaf8b79d80368df5df06fe1020122384bafc..c23f1b7be387fa1a3784be7fd7403f86a61e0db5 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import multiprocessing as mp import os import shutil diff --git a/tests/test_utils.py b/tests/test_utils.py index 7eb4ccffcbf08a157a0e6154bec80bdb3beb497b..64df0e3b8b41f49a76224744e094c436c52ba40f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import socket diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index e17ffa86774f18ff94135902cefad68f2b2bd6e5..576a8fa482b124d346823c3c7347d5874ee20e19 100644 --- 
a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from copy import deepcopy import os diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index c2775ea4557e6190997226feb3becb716e19b048..ec06a703c96564295c101bc37bef4286587a125e 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, Generator, List, Optional import pytest diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py index 22f7a80fb27ec6db790cef1186087249e2acdd5a..1a38dc8e372f0e16fb7d955304b099b6389494fb 100644 --- a/tests/tokenization/test_get_eos.py +++ b/tests/tokenization/test_get_eos.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This test file includes some cases where it is inappropriate to only get the `eos_token_id` from the tokenizer as defined by diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py index b2790fa072b83ab2aa60cd7134ebab26b0060382..8379091f8e47beef2c691e3b45dda07e593864a0 100644 --- a/tests/tokenization/test_tokenizer.py +++ b/tests/tokenization/test_tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import os from transformers import PreTrainedTokenizerBase diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 3d548ba11a1d8c9bb3c5a6a7a0e28eea0d47325b..788d5d54402ceba4deba83295d9246ae2b6b9d37 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import sys diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py index 530ef0bf23bc8dfe63280575b8110b6a07abcfe6..ba7dce4b61e293b955245e45a7e198e037b1e04d 100644 --- 
a/tests/tool_use/conftest.py +++ b/tests/tool_use/conftest.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest import pytest_asyncio # from huggingface_hub import snapshot_download diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py index f50b68bb264b46afdbeaf1c8c2779476e05964d3..0185103b609beaefcb07ea894a794fe1b9a6b540 100644 --- a/tests/tool_use/test_chat_completion_request_validations.py +++ b/tests/tool_use/test_chat_completion_request_validations.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 75bbfbb766931b38617f782fa22e683b3f6a59a2..da033fa1d85c3694d99f47dc060dba7d9e345d80 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import openai diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index 9e6176dcdf9e32166794f414221bde3d53ec5af5..1060a57a0fb36c856d09861566a53988ef906640 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Generator, List, Optional diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index c294cb04919fafd5943d3a9040ce2b9430f187db..b49a5e8e7e4c7a0c474d28818e86add84dafde78 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Dict, List, Optional diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index fe8cb496c974151166da3bc97cc8ca9461d43094..45f1bfc45bd78ad15e6b5ab1c91af12f0be0a3db 
100644 --- a/tests/tool_use/test_tool_calls.py +++ b/tests/tool_use/test_tool_calls.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Dict, List, Optional diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 5dd9e643c875b94891e718b405edec056d81e34d..d6df85dcb5b7434c012e3d33711b592d8eadb7ad 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from copy import deepcopy from typing import Any, Dict, List, Optional import os diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py index 6cd5615c44e1e0ad641cf774f49dd4310bba42fc..3db9bc73aa8761748c86305cf5f8d88cb50924e9 100644 --- a/tests/tpu/test_quantization_accuracy.py +++ b/tests/tpu/test_quantization_accuracy.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass import lm_eval diff --git a/tests/tpu/untest_compilation.py b/tests/tpu/untest_compilation.py index b7124ebc1b0f39f24289cee315da3b0a74f6608f..6ed83f30ee026ff3e0f5e79de855f4d88c5bcf2d 100644 --- a/tests/tpu/untest_compilation.py +++ b/tests/tpu/untest_compilation.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import glob import os import tempfile diff --git a/tests/tpu/untest_custom_dispatcher.py b/tests/tpu/untest_custom_dispatcher.py index 48ff6fddded1a47778ba5ca450aa71a7b624e5d2..706b7a582f3e55eafc6ba0d99cdbcf2c5475a039 100644 --- a/tests/tpu/untest_custom_dispatcher.py +++ b/tests/tpu/untest_custom_dispatcher.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from vllm.config import CompilationLevel diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index b73cc168a4fb76cf6d33e108a5c3f5eb54003e80..b8812c7a4e733560a38c610064b4eb6567e45489 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import threading 
from concurrent import futures diff --git a/tests/utils.py b/tests/utils.py index 17f558b9a35d43c2da4fb90a21bcd9d55ddb388a..827804fad92596cdf5675006345d9150d54bce92 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import copy import functools diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 0a5ba1f98221f430259cbcb269c64772e0fd6428..60cf4384d3fde0c6bec31f50e4be3fdad7c4a3a9 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.multimodal.inputs import MultiModalKwargs diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index f434fa8c61a801d521b443e0ae9869da9f739f6a..a6c0162d3f30891f8cc582bc2953afbe9f4691e3 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Compare the with and without prefix caching.""" import pytest @@ -164,7 +165,7 @@ def test_decode(): req0.num_computed_tokens = 55 for _ in range(4): req0.append_output_token_ids(8) - new_blocks = manager.append_slots(req0, 4) + new_blocks = manager.allocate_slots(req0, 4) assert new_blocks is not None and len(new_blocks) == 0 assert manager.req_to_blocks[req0.request_id][-2].block_hash is None @@ -175,7 +176,7 @@ def test_decode(): # the preallocated block. for _ in range(5 + 10): req0.append_output_token_ids(7) - new_blocks = manager.append_slots(req0, 15) + new_blocks = manager.allocate_slots(req0, 15) assert new_blocks is not None and len(new_blocks) == 0 assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None @@ -185,7 +186,7 @@ def test_decode(): # the preallocated block. 
for _ in range(6 + 11): req0.append_output_token_ids(12) - new_blocks = manager.append_slots(req0, 17) + new_blocks = manager.allocate_slots(req0, 17) # Plus one preallocated block. assert new_blocks is not None and len(new_blocks) == 2 @@ -395,12 +396,14 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): req.num_computed_tokens = block_size assert len(blocks) == 1 + num_preallocated_blocks - # Assume all computed. - manager.append_slots(req, block_size * (len(blocks) - 1)) - req.num_computed_tokens = block_size * len(blocks) + # Assume all computed, only when num_preallocate_tokens > 0, we need to + # consume the previously preallocated blocks. + if num_preallocated_blocks > 0: + manager.allocate_slots(req, block_size * (len(blocks) - 1)) + req.num_computed_tokens = block_size * len(blocks) # Append 1 block. - blocks = manager.append_slots(req, block_size) + blocks = manager.allocate_slots(req, block_size) assert len(blocks) == 1 + num_preallocated_blocks @@ -503,7 +506,7 @@ def test_mm_prefix_caching(): # Append slots without allocating a new block. for _ in range(5): req0.append_output_token_ids(8) - new_blocks = manager.append_slots(req0, 5) + new_blocks = manager.allocate_slots(req0, 5) assert new_blocks is not None and len(new_blocks) == 0 # The just completed block should have hashes with extra keys. 
@@ -603,7 +606,7 @@ def test_reset_prefix_cache(): unique_token_ids = [3] * 7 all_token_ids = full_block_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) - blocks = manager.allocate_slots(req0, 55, []) + blocks = manager.allocate_slots(req0, 55) assert [b.block_id for b in blocks] == [0, 1, 2, 3] unique_token_ids = [4] * 7 @@ -626,33 +629,3 @@ def test_reset_prefix_cache(): assert manager.reset_prefix_cache() assert not manager.cached_block_hash_to_block assert all([blk.block_hash is None for blk in manager.block_pool]) - - -def test_uncache_blocks(): - manager = KVCacheManager( - block_size=16, - num_gpu_blocks=10, - max_model_len=8192, - sliding_window=None, - enable_caching=True, - num_preallocate_tokens=0, - ) - - req0 = make_request("0", list(range(30))) - blocks = manager.allocate_slots(req0, 30, []) - assert [b.block_id for b in blocks] == [0, 1] - assert len(manager.cached_block_hash_to_block) == 1 - - req0.num_computed_tokens = 30 - - # Simulate speculative tokens. - for _ in range(5): - req0.append_output_token_ids(8) - manager.append_slots(req0, 5) - assert len(manager.cached_block_hash_to_block) == 2 - - # After sampling, assuming only 1 token is accepted. 
- req0.num_computed_tokens = 31 - num_uncached_blocks = manager.uncache_blocks(req0) - assert num_uncached_blocks == 1 - assert len(manager.cached_block_hash_to_block) == 1 diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..8eb08f3e842cacda2537dc72baf907f8f093067e --- /dev/null +++ b/tests/v1/core/test_scheduler.py @@ -0,0 +1,214 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import List, Optional + +from vllm.config import CacheConfig, ModelConfig, SchedulerConfig +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.sampling_params import SamplingParams +from vllm.v1.core.scheduler import Scheduler +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus + + +def create_scheduler( + model: str = "facebook/opt-125m", + max_num_seqs: int = 16, + max_num_batched_tokens: int = 8192, +) -> Scheduler: + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_num_batched_tokens, + ) + model_config = ModelConfig( + model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + ) + cache_config = CacheConfig( + block_size=16, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + ) + cache_config.num_gpu_blocks = 10000 + return Scheduler(scheduler_config, + model_config, + cache_config, + lora_config=None) + + +def create_requests( + num_requests: int, + num_tokens: int = 10, + mm_positions: Optional[List[PlaceholderRange]] = None, +): + sampling_params = SamplingParams() + requests = [] + for i in range(num_requests): + if mm_positions is not None: + mm_position = mm_positions[i] + mm_inputs = [MultiModalKwargs({})] * len(mm_position) + else: + mm_position = None + mm_inputs = None + request = Request( + request_id=f"{i}", + prompt=None, + 
prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=None, + arrival_time=0, + ) + requests.append(request) + return requests + + +def test_add_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + + for i, request in enumerate(requests): + scheduler.add_request(request) + assert request.request_id in scheduler.requests + assert len(scheduler.waiting) == i + 1 + + +def test_finish_request(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_ABORTED) + assert request.request_id not in scheduler.requests + assert len(scheduler.waiting) == 9 - i + + +def test_get_num_unfinished_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_STOPPED) + assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1 + + +def test_schedule(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + # Test initial scheduling + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + # Verify all requests are scheduled. 
+ for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + + # Verify requests moved from waiting to running + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == len(requests) + for i, request in enumerate(requests): + assert scheduler.running[i] == request + + +def test_schedule_multimodal_requests(): + scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf") + mm_positions = [[PlaceholderRange(offset=i, length=100)] + for i in range(10)] + requests = create_requests( + num_requests=10, + num_tokens=200, + mm_positions=mm_positions, + ) + for request in requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + assert len(output.scheduled_encoder_inputs) == 10 + for req_id, encoder_input in output.scheduled_encoder_inputs.items(): + assert len(encoder_input) == 1 + + +def test_schedule_partial_requests(): + """Test scheduling behavior with partial requests. + + This test verifies that: + 1. The scheduler can handle multiple partial requests in a single step when + constrained by encoder budget. + 2. A request in RUNNING state may be unscheduled in subsequent steps if + there is insufficient encoder budget. 
+ """ + scheduler = create_scheduler( + model="llava-hf/llava-1.5-7b-hf", + max_num_batched_tokens=1024, + ) + mm_positions = [[PlaceholderRange(offset=100, length=600)] + for _ in range(3)] + requests = create_requests( + num_requests=3, + num_tokens=800, + mm_positions=mm_positions, + ) + for request in requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 3 + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + + assert scheduler.max_num_encoder_input_tokens == 1024 + # The first request is scheduled fully. + assert output.num_scheduled_tokens[requests[0].request_id] == 800 + # The second request is scheduled partially. + # The tokens are not scheduled because of the encoder budget. + assert output.num_scheduled_tokens[requests[1].request_id] == 100 + # The third request is also scheduled partially. + # The tokens are not scheduled because of the encoder budget. + assert output.num_scheduled_tokens[requests[2].request_id] == 100 + req_to_index = { + request.request_id: i + for i, request in enumerate(requests) + } + model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[0] * len(requests), + logprob_token_ids_cpu=None, + logprobs_cpu=None, + ) + scheduler.update_from_output(output, model_runner_output) + + # Schedule the next step. + # Only the first and second requests are scheduled. + # The third request is in the RUNNING state but not scheduled in this step + # because of the encoder budget. 
+ output = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output.scheduled_new_reqs) == 0 + assert len(output.scheduled_cached_reqs) == 2 + assert len(output.finished_req_ids) == 0 + assert output.num_scheduled_tokens[requests[0].request_id] == 1 + assert output.num_scheduled_tokens[requests[1].request_id] == 700 + assert requests[2].request_id not in output.num_scheduled_tokens diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py index 8ec9f1ba3f55e6d31117db56525f8c2f97bd7f34..a8079dcce5e2f3efe7f66a5e46c4349031351487 100644 --- a/tests/v1/e2e/test_cascade_attention.py +++ b/tests/v1/e2e/test_cascade_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm import LLM, SamplingParams diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 533fe54e824320cf88a24dc86f6294887dfb5ccb..41c5dc4709a58dc4e3df9275e12e7e2e1c9fb3bd 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from contextlib import ExitStack from typing import List, Tuple diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index 2e17770b5a8584b288ce04ddb397ad7b447adc61..e7d1cda4b57b5bc423a5deea4d1182d76ef3d05a 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 1f803c3cccf5fca5b2f9c9a7b8c4bf65fa8299ec..dba05110a7809fd4d13956fc6ddbe6a23c8582d1 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import uuid diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 
9c52d994d56f8cf1a49b87954bda34788b5a7f14..b4e3e43307b570588166d8732cc7ae0b51450668 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import time import uuid diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index c58c8f69663fdbe941a5362b2e5f2f35dda73bc8..14bd2d2726c562cc436e90f8155bdf34a31cc531 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import os diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 5ebf72927cfd6389a58a1e3abfcff733cf7743e7..f7eedcb9c58d61049bc62bb1b37775d4de291266 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Set, Tuple import numpy as np diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py index 580392ac5f446836d984fc9d2bee5c07904ef430..48419d8a2791feb61a82c7105959861f3c459826 100644 --- a/tests/v1/test_stats.py +++ b/tests/v1/test_stats.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pytest from vllm.sampling_params import SamplingParams diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py index ac773b611f4065dfe66222a2c8de0e56cec80a98..9b669ae006608c64b514198251a214b54413ce0c 100644 --- a/tests/v1/test_utils.py +++ b/tests/v1/test_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import torch diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 694ce81ff6e220e3b332cc45d1c12c0e016eaacd..5b40fbff8212e596bf0d89c0d3985712507123c1 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + from typing import Dict, List, Set, Tuple import numpy as np diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py index 790e891ec837dc1f3694954a40c8e044652e9781..c039431494c4edd0648f6f13feb3d2fef97a4540 100644 --- a/tests/vllm_test_utils/setup.py +++ b/tests/vllm_test_utils/setup.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from setuptools import setup setup( diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py index 6505c81546bb0510d59175cf7448cf06f8680ef8..1d1219fbeffa154aff3f8112efe0192b3e504f36 100644 --- a/tests/vllm_test_utils/vllm_test_utils/__init__.py +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ vllm_utils is a package for vLLM testing utilities. It does not import any vLLM modules. diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py index 1ddd3471d357b35d85ee2498175200dac09b6fa2..392fd2705fb2749ffef20e72b537a36ebd5481f4 100644 --- a/tests/vllm_test_utils/vllm_test_utils/blame.py +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import dataclasses import sys diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py index a237f53a75d18d053975fba21013d59b5a163050..44d45f2621054266598746206b14979045524819 100644 --- a/tests/vllm_test_utils/vllm_test_utils/monitor.py +++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import dataclasses import sys diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 8d0b13166cc6c92dc9e9d3d288ef2f7f6308d474..c2aae72feb45df89196b3417523d1ff681dcbee3 100644 --- a/tests/weight_loading/test_weight_loading.py +++ 
b/tests/weight_loading/test_weight_loading.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pytest diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index ce3fbc56fa3f7d7d875daa2025cf7b74c0065449..5384314cb7d2f7e589fbee4f2493dbb6face0e69 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from typing import List diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index 57f1fd47a600fb79e45b49635aa9b851f7a52f2e..eb341fb1b29314d335862d5f84aeaf43dc71df21 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import List, Tuple, Type diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 9216ac32d8d7cd50cdd8a92521aed413630c258c..f666a990d2d3b11e2c4c1cae4ebfe1a00ff60c1e 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import pytest @@ -25,6 +27,15 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: return model_runner +def test_deepseek_mla_attn_backend_module(): + model_runner = _create_model_runner( + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", + trust_remote_code=True, + enable_chunked_prefill=False, + ) + assert model_runner.attn_backend.__name__ == "TritonMLABackend" + + @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_prompt(batch_size): model_runner = _create_model_runner( diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index c8df5ca120c1c39231be0c63c2b2bb3384da03bb..cb871a3c132ca83557cbc407c76f6aa47dae1196 100644 --- a/tests/worker/test_profile.py +++ 
b/tests/worker/test_profile.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import torch diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 0c56819e164746c8cc8b520338e8117c3a39f5ce..a7f44a3c575b9ae51d921853dd3a7f59652acd18 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import os diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py new file mode 100644 index 0000000000000000000000000000000000000000..709befc53207c26b5cb948f98aa97a2d0f68fc8d --- /dev/null +++ b/tools/check_spdx_header.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 + +import sys + +SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0" +SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:" + + +def check_spdx_header(file_path): + with open(file_path, encoding='UTF-8') as file: + lines = file.readlines() + if not lines: + # Empty file like __init__.py + return True + for line in lines: + if line.strip().startswith(SPDX_HEADER_PREFIX): + return True + return False + + +def add_header(file_path): + with open(file_path, 'r+', encoding='UTF-8') as file: + lines = file.readlines() + file.seek(0, 0) + if lines and lines[0].startswith("#!"): + file.write(lines[0]) + file.write(SPDX_HEADER + '\n') + file.writelines(lines[1:]) + else: + file.write(SPDX_HEADER + '\n') + file.writelines(lines) + + +def main(): + files_with_missing_header = [] + for file_path in sys.argv[1:]: + if not check_spdx_header(file_path): + files_with_missing_header.append(file_path) + + if files_with_missing_header: + print("The following files are missing the SPDX header:") + for file_path in files_with_missing_header: + print(f" {file_path}") + add_header(file_path) + + sys.exit(1 if files_with_missing_header else 0) + + +if __name__ == "__main__": + main() diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 
54cd60c2bc95b0e142d484987d3351b439d1da7f..adbb7301bfc76de94a18922aff4d606a593facb7 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import json from typing import Dict diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index cb56ebd69a8c1d54d1fcaf6651a715f25012e1f5..c527cdbe02259e9e481875bceec63d43e3267ecc 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import copy import json diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 9dc19f5fd4cdd6a4bd315f35fa5cdc43cd95f9e7..011af25229f4bd7f0d655a232ebd3b402a9d8cef 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 + # Copyright (c) 2018 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. 
diff --git a/use_existing_torch.py b/use_existing_torch.py index 319d262898fe3882e0ca0c6ea80a442ef66c464d..a578328b0357d7952fbd29b698fb0b4e4a0f2eb2 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import glob requires_files = glob.glob('requirements*.txt') diff --git a/vllm/__init__.py b/vllm/__init__.py index 66ac5a64c1fe04b925d153c8318b0ee3d7397c34..0b09afb179fc6e4dd99eb7e7c22a9bd5cf4c8a52 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" import os diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index f51bac856764b27a26d2aedd71e7a10b237193e9..070b3865c57d86407c36917c117e0e4262fe7b86 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import contextlib import importlib from typing import TYPE_CHECKING, List, Optional, Tuple, Union, Type @@ -1369,6 +1371,15 @@ def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, num_tokens_post_pad) +def sgl_moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, + block_size: int, sorted_token_ids: torch.Tensor, + experts_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor) -> None: + torch.ops._moe_C.sgl_moe_align_block_size(topk_ids, num_experts, + block_size, sorted_token_ids, + experts_ids, num_tokens_post_pad) + + def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor, token_expert_indicies: torch.Tensor, gating_output: float) -> None: @@ -1445,6 +1456,11 @@ def copy_blocks(key_caches: List[torch.Tensor], torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) +def copy_blocks_mla(kv_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping) + + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: 
torch.Tensor) -> None: torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 28b804f765a3a9bca82f020d6c14b79a1810131d..ccb67baa53383b9748491036c1f9c8f56aa038f8 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import torch diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py index 3ed60678b52f5bd3b09d195a531ad0e8ef747f16..18e0c5227d45c06b48a62fba99908d7521736b02 100644 --- a/vllm/adapter_commons/layers.py +++ b/vllm/adapter_commons/layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Tuple diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py index 468904c90fff40ebe8307fb581add14d49e08a18..f9a5d2fffad5e62ba4bb69d9f31419bf8a281f3b 100644 --- a/vllm/adapter_commons/models.py +++ b/vllm/adapter_commons/models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Any, Callable, Dict, Optional, TypeVar diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py index 2bb17fdc011096824891ba2b82bae91d7f25384d..2b604b91bbb6b43e0e5b63fc555c7cef15c88f3d 100644 --- a/vllm/adapter_commons/request.py +++ b/vllm/adapter_commons/request.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py index 1e9adca50093bf6bcaa86eba489b897c158287c0..c2dc5433cc65671fa99e2fb1ec4855cd4f1e2c68 100644 --- a/vllm/adapter_commons/utils.py +++ b/vllm/adapter_commons/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, Optional, Set diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py index 
83929e82ebf0423b6475308725e43b7a79e4df65..ce24e08a5b56ef441f60d937c27193d3e446f10f 100644 --- a/vllm/adapter_commons/worker_manager.py +++ b/vllm/adapter_commons/worker_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Any, Optional, Set diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index a46c67ad7e00e99587feb9629d5e39b15a9b91ae..d9e51082e6ca2a7ea54deb5c30a30b35c561eadf 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Literal from urllib.parse import urljoin diff --git a/vllm/assets/base.py b/vllm/assets/base.py index 249173141106c6ea6d54610d6d5414d6c8ac0bb4..03f3b9dabf1438662d533be53d3abacc679c472b 100644 --- a/vllm/assets/base.py +++ b/vllm/assets/base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache from pathlib import Path from typing import Optional diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 0a55506f88255f0d170a262a64bb60e5ad316975..2b1d258da9c784ab2eeffbcb380afbf75c338183 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Literal diff --git a/vllm/assets/video.py b/vllm/assets/video.py index eca2ccc54482c714cf65096b42cab3085b4fa5a1..494cfc38381cfd70c93787a9eb013eec89425768 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from functools import lru_cache from typing import List, Literal diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 2cd4ad3e00135ea6638947de7a62fa9deb435e6f..85c5715faba7f11c95db33a9eb82a7f3946613f8 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from 
vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata, AttentionMetadataBuilder, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index b9425f659f7c04b060c16aaad533225b78243721..5f0a540135402e6afb9a37ea94a7d0b5bacc9325 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 385fdab8f4c99c3297a475cb17b6b7161df89402..5ca8d913738ec8133c7c48e5e77a265aa6dd4d50 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 7ec5a5c97fa21901364d5e580b910dd64bcf009b..ce71a488eeea71b5c6274f4946cae71439876016 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer with FlashAttention.""" from collections import defaultdict from dataclasses import dataclass diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 6982fbacd3adb408045492c3f1132c629fbcab89..bb9e92f7770d8f0b81f10cbb1c7620154af3e398 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from collections import defaultdict from contextlib import contextmanager diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 
80c132c0a8c05393c0f10dd23248e7bad0dd76bd..1ad5e6e8e4e17e57f1f8e073b90b812b72b4765c 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### @@ -8,7 +10,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch import vllm_hpu_extension.ops as ops -from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache +from vllm_hpu_extension.utils import (Matmul, ModuleFusedSDPA, Softmax, + VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, @@ -135,9 +138,17 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] + self.fused_scaled_dot_product_attention = None if self.prefill_usefusedsdpa: assert alibi_slopes is None, \ 'Prefill with FusedSDPA not supported with alibi slopes!' + try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA + self.fused_scaled_dot_product_attention = ModuleFusedSDPA( + FusedSDPA) + except ImportError: + logger().warning("Could not import HPU FusedSDPA kernel. 
" + "vLLM will use native implementation.") suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() if head_size not in suppored_head_sizes: @@ -225,6 +236,7 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): matmul_qk_op=self.matmul_qk, softmax_op=self.softmax, matmul_av_op=self.matmul_av, + fsdpa_op=self.fused_scaled_dot_product_attention, ) output = out.reshape(batch_size, seq_len, hidden_size) else: diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 57916a3c6a34c9ea602fcf5159c8f5aaa6d3b92c..b4879af4cf20e93fcc9156cca0853b137a14f757 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py index e8fec234c0225fc394386043f8a4817b2e21e312..cd8c08e5ab47260af39beb7d8f4370a6bae46b72 100644 --- a/vllm/attention/backends/mla/utils.py +++ b/vllm/attention/backends/mla/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from dataclasses import dataclass from typing import Any, Dict, Generic, List, Optional, Tuple @@ -24,8 +26,13 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( apply_fp8_linear_generic, current_platform_fp8_dtype, is_fp8) from vllm.model_executor.layers.quantization.utils.quant_utils import ( scaled_dequantize, scaled_quantize) -from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding -from vllm.vllm_flash_attn import flash_attn_varlen_func +from vllm.model_executor.layers.rotary_embedding import ( + DeepseekScalingRotaryEmbedding, RotaryEmbedding) + +try: + from vllm.vllm_flash_attn import flash_attn_varlen_func +except ImportError: + from flash_attn import flash_attn_varlen_func @dataclass @@ -168,6 +175,8 @@ class 
MLACommonImpl(MLAAttentionImpl[T], Generic[T]): self.v_head_dim = v_head_dim self.rotary_emb = rotary_emb + self.use_yarn_rope = isinstance(rotary_emb, + DeepseekScalingRotaryEmbedding) self.q_proj = q_proj self.kv_b_proj = kv_b_proj self.o_proj = o_proj @@ -414,6 +423,24 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): ) -> torch.Tensor: raise NotImplementedError + def apply_pure_rope( + self, + input_positions: torch.Tensor, + q_pe: torch.Tensor, + k_pe: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + seq_len = input_positions.size(0) + ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape + + q_pe, k_pe = self.rotary_emb( + input_positions, + q_pe.reshape(seq_len, -1), + k_pe.reshape(seq_len, -1), + ) + q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) + + return q_pe, k_pe + def forward( self, layer: AttentionLayer, @@ -438,13 +465,14 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): # Restore head dim (for rotary embedding) k_pe = k_pe.unsqueeze(1) assert hasattr(attn_metadata, "input_positions") + rope_fn = (self.rotary_emb + if self.use_yarn_rope else self.apply_pure_rope) if is_decode: q_nope = self._q_proj_and_k_up_proj(hidden_states_or_q_c) q_pe = torch.matmul(hidden_states_or_q_c, self.W_QR)\ .view(-1, self.num_heads, self.qk_rope_head_dim) - q_pe, k_pe = \ - self.rotary_emb(attn_metadata.input_positions, q_pe, k_pe) + q_pe, k_pe = rope_fn(attn_metadata.input_positions, q_pe, k_pe) else: assert is_prefill q = self.q_proj(hidden_states_or_q_c)[0]\ @@ -452,7 +480,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): # TODO(lucas): there must be a nicer way to write this line q[..., self.qk_nope_head_dim:], k_pe = \ - self.rotary_emb( + rope_fn( attn_metadata.input_positions, q[..., self.qk_nope_head_dim:], k_pe) diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index be06d1600998889e85b9ddef8b01f74846bfe3c2..9908620a32a238ac31202fe570ca0752cdc47213 100644 --- 
a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Type @@ -138,3 +140,7 @@ class OpenVINOAttentionMetadata: # `model_executable`. multi_modal_placeholder_index_maps: Optional[Dict[ str, MultiModalPlaceholderMap.IndexMap]] + + # Enable/disable KV scales calculation. This is so that we can disable the + # calculation until after prefill and cuda graph capture. + enable_kv_scales_calculation: bool diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 209a623ba441c0b222d20f838ac1f54ea0488b22..b61dfe63ddcaaffea67b617b4746d71eb0e07805 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 826311896d1d23fad15fd2a15157ceabba238dd2..9f6e731afd1930214cd35b9b700b3a00a70952c5 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import defaultdict from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 72fdcd6505cf3170d64f18a159dd726129360c44..e3eb70011dd1b0c11500105279038770f9d12359 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer ROCm GPUs.""" from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type diff --git 
a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index c3b2398b4e632513c61f62a090c0f59546b541f5..25fe6ed95c5dff1ab32b43c9e841d376f9266602 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 95dc119a47bb530032f0d75fe9de209dc94bfd83..9a1984a931b55ab62e52243c88d7806ee1160cad 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass @@ -24,7 +26,6 @@ from vllm.attention.backends.mla.utils import MLACommonImpl, MLACommonMetadata from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) -from vllm.attention.ops.paged_attn import PagedAttention from vllm.attention.ops.triton_decode_attention import decode_attention_fwd from vllm.utils import async_tensor_h2d, make_tensor_with_pad @@ -70,14 +71,14 @@ class TritonMLABackend(AttentionBackend): dst_kv_cache: torch.Tensor, src_to_dst: torch.Tensor, ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) + ops.copy_blocks_mla(kv_caches, src_to_dists) @staticmethod def get_supported_head_sizes() -> List[int]: diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index c2a94286e6c82c53563616ee6d8b51c7a5f5250a..3fa8f9c57d6172c3e73ce35ee3ddc36a74efc1ee 100644 --- 
a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention backend utils""" from collections import defaultdict from contextlib import contextmanager diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 1ff54e2b320d2f1be6f872dfb2d7e6a8fa21a811..1504ed18788edddd8780395e48d04ee18daf0bc5 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer with xFormers and PagedAttention.""" from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b97165f625e51207c833386ff9411db53b9d0774..e4df7ffc588544f43b68171b7ce4c5a300099b73 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer.""" from typing import Any, Dict, List, Optional @@ -155,9 +156,13 @@ class Attention(nn.Module): kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: - if self.calculate_kv_scales and \ - attn_metadata.enable_kv_scales_calculation: - self.calc_kv_scales(key, value) + # NOTE: please avoid accessing `kv_cache` and `attn_metadata` arguments + # directly, use `self.kv_cache` and + # `get_forward_context().attn_metadata` instead. 
+ if self.calculate_kv_scales: + ctx_attn_metadata = get_forward_context().attn_metadata + if ctx_attn_metadata.enable_kv_scales_calculation: + self.calc_kv_scales(key, value) if self.use_output: output = torch.empty_like(query) hidden_size = query.size(-1) @@ -171,15 +176,27 @@ class Attention(nn.Module): if value is not None: value = value.view(-1, self.num_kv_heads, self.head_size) if self.use_direct_call: - unified_attention_with_output(query, key, value, output, - self.layer_name) + forward_context: ForwardContext = get_forward_context() + ctx_attn_metadata = forward_context.attn_metadata + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + self.impl.forward(self, + query, + key, + value, + self_kv_cache, + ctx_attn_metadata, + output=output) else: torch.ops.vllm.unified_attention_with_output( query, key, value, output, self.layer_name) return output.view(-1, hidden_size) else: if self.use_direct_call: - return unified_attention(query, key, value, self.layer_name) + forward_context = get_forward_context() + ctx_attn_metadata = forward_context.attn_metadata + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + return self.impl.forward(self, query, key, value, + self_kv_cache, ctx_attn_metadata) else: return torch.ops.vllm.unified_attention( query, key, value, self.layer_name) diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py index 727a470ba6d0ec9b110ba763e07d85f2f64b154f..71caf3cbac02c65095484eda96de4c5219bc56a5 100644 --- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import triton import triton.language as tl diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index 
350f88c8f9740c1d741cc5f425ec6a3b4e1d693b..6ab69ea5b4098d8a1cc0f3728c5f013f9564718d 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import torch diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py index 78d752230d6e7bc37c0d2cb615dd4e074ab97412..4de9bd530642806c3d842674442be98f4342f4db 100644 --- a/vllm/attention/ops/blocksparse_attention/utils.py +++ b/vllm/attention/ops/blocksparse_attention/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Helper functions for 3D sparse pattern # These function are not optimized and very inefficient. # Avoid calling them too frequent or use a cache mechanism. diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 4c0fb2a628361937a47969bfecede7a45f4c1bab..8bb536343ed8c45d166d13e642daf23464e90118 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 3a07184ed31f0213d1fb1b5c7434bebf90637c35..598ceea130d97b029cc766ebf22b8a01588b8e70 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Optional, Tuple try: diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 9de4ef7f5a1402918e5c0c3f500f5869ef9f8361..68aa63f5ac16c3f094fed3579a95a4716695c712 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass import neuronxcc.nki.isa as nisa diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 43647488181c607e1b8175d58f6b6a28960be7e8..4a36ce9f2f51566078ba43b59755d86794b77345 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import List, Optional, Tuple diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 0ba91fbb363a9fc13c682a2eb95c98510c14220e..198e747d0836db558a11b8637e4a2a42c47da0ee 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # The kernels in this file are adapted from LightLLM's context_attention_fwd: # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py @@ -9,9 +11,12 @@ from vllm.platforms import current_platform # Static kernels parameters # BASE_BLOCK = 128 if current_platform.has_device_capability(80) else 64 +# NUM_WARPS = 4 if current_platform.is_rocm() else 8 + BASE_BLOCK = 32 if 
current_platform.has_device_capability(80) else 32 NUM_WARPS = 8 + # To check compatibility IS_TURING = current_platform.get_device_capability() == (7, 5) diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py index 675df109b6c0e72c44cb64afbd0ec104bdc7827a..057fccb5e5981b7884db23984490145ab8d80d18 100644 --- a/vllm/attention/ops/triton_decode_attention.py +++ b/vllm/attention/ops/triton_decode_attention.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py # which was originally adapted from @@ -202,10 +204,10 @@ def _decode_att_m_fwd( Req_to_tokens.stride(0), q.stride(0), q.stride(1), - k_buffer.stride(-2), - k_buffer.stride(-1), - v_buffer.stride(-2), - v_buffer.stride(-1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) att_out.stride(0), att_out.stride(1), att_out.stride(2), @@ -436,10 +438,10 @@ def _decode_grouped_att_m_fwd( Req_to_tokens.stride(0), q.stride(0), q.stride(1), - k_buffer.stride(-2), - k_buffer.stride(-1), - v_buffer.stride(-2), - v_buffer.stride(-1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) att_out.stride(0), att_out.stride(1), att_out.stride(2), diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index 212a696dc975e3b9fc379a643818b0f87eef284b..6295aee243b73e99dc4f0767014fe79f9cd1e0a7 100644 --- 
a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# SPDX-License-Identifier: Apache-2.0 """ Fused Attention =============== diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 4c6bbc7272280956448ce4c43edebc032caab5e0..26c6ac812a12507af0d8f91f13a7cd42a2ccfa59 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from contextlib import contextmanager from functools import cache diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 026037e5434d1619339e05cf534994e18cd03cd1..97b2b630fc3e5101107aec09d856c49b48ff2539 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 7f4f97466d503ee20f39749f1f0adc191b08cab0..979890170c16b08896321e449103e3b3f05534ff 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ast import copy import dataclasses diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 6385f1c5dbf816e3e0632444aea9a9fe143dfc86..a6f11a3af4d4c78219dd218754184ecf74a38be4 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy import dataclasses from contextlib import contextmanager diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 17eb0592ced6de7261b7c12848235f764f7f0b6d..20afe6967df39dd72dd3728a0995b54b49242f0a 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import inspect from typing import Callable, Dict, List, 
Optional, TypeVar, Union, overload from unittest.mock import patch diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index e15d7b315c50f01328eec7294a836e321c7bb9b5..9b0e9c5d04081013b9a2f3e0b791b0eab03a412a 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import operator from typing import Dict, Iterable, List, Optional, Tuple, Union diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index cde27bd1082124893211a767a3f68be61d768a99..0c3d8697b2375bb77c1ae9fd4e35eb1684be5c73 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, Dict, List, NamedTuple, Optional, Tuple import torch diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index 924e26f2e262ef621105313c136b4258aee9fa23..b9a8d3112e7758fe71756699b57542bb31c34954 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import operator from typing import Iterable, Optional diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index f6846c08ac841af6bfce66a525e65d57068369e0..be663946f4d815db4ca9d48eca2d77f904db7b52 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import hashlib import inspect import types diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index b97e40415b41b4c6d2e56a6f7b9bcc89eb66a8d9..786c7c1e1859a64a6338d869812172a01b757341 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py index 
b6bcecdc89e2694fcbdeb25b433d21c1e7aeaa67..e6f6a60b25950eb50fce0c9c2b2905cb20ff433f 100644 --- a/vllm/compilation/multi_output_match.py +++ b/vllm/compilation/multi_output_match.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import abc import operator from abc import abstractmethod diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 34f5f355798b2d78631f0c8e77e1750ffa0b81b7..c7387fb7c2db959a8d78a38cf8daf57954fdba60 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List from torch import fx as fx diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py index ba28b1f0be7bd130a3026a88fc5f8f32183d8a74..292baae852822d739808aa9a71b58fee77a251b9 100644 --- a/vllm/compilation/reshapes.py +++ b/vllm/compilation/reshapes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Union import torch.fx diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index b8c52a7f468383d1d3ee72543c4074f0ff0729b3..1d2597e42711fcf99e1dc740463c9fdb7b91e295 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time import torch diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 58a8fa76f6ce221c93c2116d908d73340a03e65c..a8a283ddd8c0c0b637f547d6a675e4e6e3119aea 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import sys from abc import abstractmethod diff --git a/vllm/config.py b/vllm/config.py index 33a6f83186b597f7806c5333fbce994b1285501f..0794e07293d8ff327b0dc54dff1c8ad110560bee 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ast import copy import enum @@ -81,6 
+83,12 @@ class SupportsHash(Protocol): ... +class ModelImpl(str, enum.Enum): + AUTO = "auto" + VLLM = "vllm" + TRANSFORMERS = "transformers" + + class ModelConfig: """Configuration for the model. @@ -165,6 +173,12 @@ class ModelConfig: `logits_processors` extra completion argument. Defaults to None, which allows no processors. generation_config: Configuration parameter file for generation. + model_impl: Which implementation of the model to use: + "auto" will try to use the vLLM implementation if it exists and + fall back to the Transformers implementation if no vLLM + implementation is available. + "vllm" will use the vLLM model implementation. + "transformers" will use the Transformers model implementation. override_generation_config: Override the generation config with the given config. """ @@ -228,6 +242,7 @@ class ModelConfig: generation_config: Optional[str] = None, enable_sleep_mode: bool = False, override_generation_config: Optional[Dict[str, Any]] = None, + model_impl: Union[str, ModelImpl] = ModelImpl.AUTO, ) -> None: self.model = model self.tokenizer = tokenizer @@ -239,6 +254,7 @@ class ModelConfig: self.code_revision = code_revision self.rope_scaling = rope_scaling self.rope_theta = rope_theta + self.model_impl = model_impl if hf_overrides is None: hf_overrides = {} @@ -738,7 +754,6 @@ class ModelConfig: @property def is_deepseek_mla(self) -> bool: - # TODO add deepseek_v3 return (hasattr(self.hf_text_config, "model_type")) \ and (self.hf_text_config.model_type in \ ('deepseek_v2', 'deepseek_v3'))\ @@ -970,6 +985,9 @@ class ModelConfig: @property def use_mla(self) -> bool: + if not self.is_deepseek_mla or envs.VLLM_MLA_DISABLE: + return False + if self.quantization is not None and self.quantization not in [\ "fp8", "compressed-tensors"]: logger.warning( @@ -981,8 +999,9 @@ class ModelConfig: # have fp8 for both weights and activations. 
if self.quantization == "compressed-tensors": quant_config = self._parse_quant_hf_config() - for group_name, cfg in quant_config.get("config_groups", - ("", {})).items(): + for group_name, cfg in quant_config.get("config_groups", { + "": {} + }).items(): act_cfg = cfg.get("input_activations", {}) act_type = None if act_cfg is None else act_cfg.get("type", "") w_cfg = cfg.get("weights", {}) @@ -996,8 +1015,7 @@ class ModelConfig: quant_config) return False - use_mla = (self.is_deepseek_mla and not envs.VLLM_MLA_DISABLE) - return use_mla + return True @property def supported_runner_types(self) -> Set[RunnerType]: diff --git a/vllm/connections.py b/vllm/connections.py index 4c9f4f40cf64093f5d517db48c24097094521c05..dc060bb6f88a75f9b8d300c364d21594cf23d1f6 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from pathlib import Path from typing import Mapping, MutableMapping, Optional from urllib.parse import urlparse diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 90c1438efbd083abed9f605682c3ecfe8bbdee92..d4d31c58dc8d4c607bf012129ed1d8f7883916f9 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from typing import List, Optional diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 115f663e4ad346bbbb440579767e78a442237c1f..1966eac1cf9e0c0a8f8522a0a3e72c64f57d2952 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import deque from dataclasses import dataclass from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index c3e1665b4464e11f83e05806b3098815094e5b69..359b5b263f689ea471b67e7dc924fc504856a419 100644 --- 
a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, FrozenSet, List, Optional, Tuple from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index cb432db919c7317a5a4ec85610f4c0f71ec6d003..0b0197deb8d47f10ac11747f46523623552b7190 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index c38ae2dd6761b92bf50ed77f9cf3aecfef063d65..c388366b825f2c236712d4af068b47714701d95d 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import deque from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index ccdc5daa9595ce5f0cb65ec451a3a21b28729c3f..1ca9e49dac371bd9fd835bc36f6e654235ac5d6c 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Token blocks.""" import sys from bisect import bisect_left @@ -64,6 +65,15 @@ class PrefixCachingBlockAllocator(BlockAllocator): from 0 to num_blocks - 1. """ + # Note that we use 'None' as a string here instead of None because + # as of Python 3.12, hash(None) returns a constant predictable value. + # This could possibly make it easier to find and exploit hash + # collisions. 'None' as a string will be hashed differently per process, + # but consistently within the same process. This is the same as the + # behavior of None prior to Python 3.12. 
+ _none_hash: int = hash('None') + + # Implements Block.Factory. def __init__( self, num_blocks: int, @@ -121,7 +131,6 @@ class PrefixCachingBlockAllocator(BlockAllocator): self.metric_data = CacheMetricData() - # Implements Block.Factory. def _create_block( self, prev_block: Optional[Block], @@ -736,6 +745,14 @@ class PrefixCachingBlock(Block): such as adapters that influence the block, apart from the token_ids. """ + # Note that we use 'None' as a string here instead of None because + # as of Python 3.12, hash(None) returns a constant predictable value. + # This could possibly make it easier to find and exploit hash + # collisions. 'None' as a string will be hashed differently per process, + # but consistently within the same process. This is the same as the + # behavior of None prior to Python 3.12. + _none_hash: int = hash('None') + def __init__( self, prev_block: Optional[Block], @@ -890,13 +907,13 @@ class PrefixCachingBlock(Block): is_first_block = self._prev_block is None prev_block_hash = ( - None if is_first_block else + self._none_hash if is_first_block else self._prev_block.content_hash # type: ignore ) # Previous block exists but does not yet have a hash. # Return no hash in this case. - if prev_block_hash is None and not is_first_block: + if prev_block_hash == self._none_hash and not is_first_block: return None self._cached_content_hash = PrefixCachingBlock.hash_block_tokens( @@ -906,8 +923,9 @@ class PrefixCachingBlock(Block): extra_hash=self._extra_hash) return self._cached_content_hash - @staticmethod - def hash_block_tokens(is_first_block: bool, + @classmethod + def hash_block_tokens(cls, + is_first_block: bool, prev_block_hash: Optional[int], cur_block_token_ids: List[int], extra_hash: Optional[int] = None) -> int: @@ -928,7 +946,8 @@ class PrefixCachingBlock(Block): Returns: - int: The computed hash value for the block. 
""" - assert (prev_block_hash is None) == is_first_block + if is_first_block and prev_block_hash is None: + prev_block_hash = cls._none_hash return hash((is_first_block, prev_block_hash, *cur_block_token_ids, extra_hash)) @@ -948,6 +967,14 @@ class ComputedBlocksTracker: cached block hashes in the allocator. """ + # Note that we use 'None' as a string here instead of None because + # as of Python 3.12, hash(None) returns a constant predictable value. + # This could possibly make it easier to find and exploit hash + # collisions. 'None' as a string will be hashed differently per process, + # but consistently within the same process. This is the same as the + # behavior of None prior to Python 3.12. + _none_hash: int = hash('None') + def __init__( self, allocator: DeviceAwareBlockAllocator, @@ -993,7 +1020,7 @@ class ComputedBlocksTracker: # We need to know the hash of the previous block to compute the hash of # the current block so that blocks could be uniquely identified across # sequences of prefixes. - prev_block_hash = (None if cur_num_blocks_recorded == 0 else + prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else block_hashes_recorded[-1]) # Only update the computed block hashes for the new blocks for i in range(cur_num_blocks_recorded, num_computed_blocks): @@ -1008,7 +1035,7 @@ class ComputedBlocksTracker: # This has to be kept in sync with the allocator's hash # calculation. 
block_hash = PrefixCachingBlock.hash_block_tokens( - is_first_block=prev_block_hash is None, + is_first_block=prev_block_hash == self._none_hash, prev_block_hash=prev_block_hash, cur_block_token_ids=block_token_ids, extra_hash=extra_hash, diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 1c6578e4cc6ab4263fd665dc1674416bced93d79..910afdd9feff1d616339fda39071494f913b9c36 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Block manager utils.""" from vllm.sequence import SequenceGroup from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 2d6a132ed555b96e7ea4944e72102a87e2cce48d..c5b3b04f37ca3018f554d7670d4b3a8ba6fafebc 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A block manager that manages token blocks.""" from typing import Dict, List, Optional from typing import Sequence as GenericSequence diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index c9306518223a3ae51186405ca014cfdd3453198f..0e363eddc8a5e933173cc42f4c3a8eeafbe85c23 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import heapq from abc import ABC, abstractmethod diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 9c7e246e3c4eda09a1ce2cbbf702b6539b077816..b48ba87e95a0b10d623fcb67aed36e7f02c2f1c3 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from abc import ABC, abstractmethod from typing import List diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index f9924be4a3835f15e516b53a085570244769bca1..70c22afa8e1583b0ef4c5e54ee5ef80972ca0a39 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ 
b/vllm/core/placeholder_block_space_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Tuple from vllm.core.interfaces import AllocStatus, BlockSpaceManager diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 2bb961481e5fe1b9db9f257876971eb1f5817d3a..f507847ad82cf51598aca8a642131585e49076f9 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import os import random diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py index a43418dbb3b46a34d6ab9028f831966d70699279..f74ad9ac33852fd0ffaa8d2aefc7b35ac8d0e341 100644 --- a/vllm/device_allocator/cumem.py +++ b/vllm/device_allocator/cumem.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # cumem-based pytorch pluggable allocator to implement sleep mode. # other approaches tried but failed: # - cuda-python package binding diff --git a/vllm/distributed/__init__.py b/vllm/distributed/__init__.py index db325cfabf55ef6e65ebd56df44f6d58cd084582..39955ddacfe947c6c673520e97acbd7a348e5ecb 100644 --- a/vllm/distributed/__init__.py +++ b/vllm/distributed/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .communication_op import * from .parallel_state import * from .utils import * diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index e13505dc37bb043fcc90eea8362f3c2521788678..0228264f91f9a8688cc885e9c8d9c090eb954321 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, Optional, Union import torch diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index d5a53381ce621dc758cd1928e84998bdcb8cce05..010caf7ebac97292d178f2a3b893c236198f931d 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ 
b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """This file is a pure Python wrapper for the cudart library. It avoids the need to compile a separate shared library, and is convenient for use when we just need to call a few functions. diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 96b328006012dcbdd13e1a3923e4f91eb58ef308..bebd1c3b3b39ce1e62275d99e52a67d99ccc61fe 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ctypes from contextlib import contextmanager from typing import List, Optional, Union diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 1f78e10cc1dcddd7e25f049bbd7554ebe841c6d8..d8d6eed2dd7ecc46da58be1c3e5ebef77b9a7ab3 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ctypes import json import os diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py index cc9b19ce022b5381aa44c62fe5776f33ba138435..3f85da98aca43e76c7d175be3d286a3ccc77399c 100644 --- a/vllm/distributed/device_communicators/hpu_communicator.py +++ b/vllm/distributed/device_communicators/hpu_communicator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.distributed as dist from torch.distributed import ProcessGroup diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index efc59987195f5a078f96519f67cb6f94d9066c5a..0ccd423121cb0d6ccb9621eb9f2b464bee6bb057 100644 --- 
a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Union # ===================== import region ===================== diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 5d7ea368564afe155515d8e18144757887f16dec..929676e4f1c3279669ef2cc5b831efad6e70da92 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # This file is a pure Python wrapper for the NCCL library. # The main purpose is to use NCCL combined with CUDA graph. # Before writing this script, we tried the following approach: diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 268edc0925fe8e66fb47b33a62dbe50eafbb642e..48ac81ac008b2914aaddbde9ebc828dface1a57c 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pickle import sys diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index 765a0f9cb1c87ca4b6bc641b5d87f033bc930b07..7af7c65f64220cc226268209b519c7ce4791ba5f 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import torch diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py index eafd3c2f677496dd68442bcbfb679bdc81383938..79ccc101e08018d65f3a6c28665b22f31a0d99d1 100644 --- 
a/vllm/distributed/device_communicators/xpu_communicator.py +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.distributed as dist from torch.distributed import ProcessGroup diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index 6089e3babac3e28d5169216467283bcfbcccbb6d..57c764b481c29f30ed0daf18db5968ade5b8c3c3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ KVConnectorBase Class for Distributed KV Cache & Hidden State communication diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 6372dab7260863aa6847ff8be12377c42272bc8d..fe480533458b886f2cfa69ddf92d085801ae4f41 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import importlib from typing import TYPE_CHECKING, Callable, Dict, Type diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 7780e2dfa317d8bfe183e4f1c314b1d0ee51165b..2033e9762ac0b5491e0bc9c0bc457e7014d733ef 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Simple KV Cache Connector for Distributed Machine Learning Inference diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index bad119a1aa9296a81576ddb0d331b161a6146117..845da7c501e888e2726d0fe5744180d6d594f795 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ 
b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file contains a new class `KVLookupBufferBase` that allows developers to think of KV cache operations as inserting new KV cache entries (`insert`) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index fe8d8d7375f369dcbcd9e02c8ce31cf095e3c0ee..5e1b62352d14c457ac69c3654f86a8687a288f78 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Implements a distributed key-value (KV) cache transfer mechanism. diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py index 4b0cb44cc5b81b6a86c5a1b99df0560e804d3395..40589fb3ef872e97dd0d2577965824548f180323 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file defines an interface `KVPipeBase` that provides an abstraction for sending and receiving tensors, or None, via diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py index 8e4358672b74d16d2a42b349126cd153905aa574..58ab7f0b642433348fca1c96316b94f6ef49dafe 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import os import pickle diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 98222fa67e4920c57ddb6814a653b6757dafbf74..7aa53d07a9ef200d7ff3c425ccc7ebc252362cb9 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ 
b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This module implements a PyNccl pipe for sending and receiving Optional[torch.Tensor] between distributed ranks with advanced diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py index 9ce97851dc849a0555ed785b6ae6bcbb429fa3f5..1e80e0bd7de865b2c87828002d1be55bf373fb54 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_agent.py +++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A centralized entrypoint to perform distributed KV cache transfer. This implementation is a shim wrapper on two APIs exposed by `kv_connector`: diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 7fe9b68d4b9e8ba4abbd96495e33dc41d64befef..321902d11fd73a9d1076e57abfee99bb80d03d83 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The vLLM team. # Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py @@ -327,9 +329,17 @@ class GroupCoordinator: return input_ if input_.is_cpu: - import intel_extension_for_pytorch as ipex - ipex.distributed.all_reduce(input_, group=self.device_group) - return input_ + try: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) + return input_ + except ImportError: + """ + Intel IPEX not found. 
Falling back to PyTorch native + all_reduce for CPU + """ + torch.distributed.all_reduce(input_, group=self.device_group) + return input_ if self.tpu_communicator is not None and \ not self.tpu_communicator.disabled: diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index dcfcb848cbe063aa37a68f27a10ec4f639358b53..84f8c0a8e51c548e580c5ee507bb85139260e431 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The vLLM team. # Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7698c1945f2d699148b63dd09918a14ecec8cb81..7a456357b701a8545d7e544cad7bf27c4e41fa43 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import dataclasses import json @@ -11,10 +13,10 @@ import vllm.envs as envs from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat, DecodingConfig, DeviceConfig, HfOverrides, KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PoolerConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, TaskOption, TokenizerPoolConfig, - VllmConfig) + ModelConfig, ModelImpl, ObservabilityConfig, + ParallelConfig, PoolerConfig, PromptAdapterConfig, + SchedulerConfig, SpeculativeConfig, TaskOption, + TokenizerPoolConfig, VllmConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -200,6 +202,7 @@ class EngineArgs: generation_config: Optional[str] = None override_generation_config: Optional[Dict[str, Any]] = None enable_sleep_mode: bool = False + model_impl: str = "auto" calculate_kv_scales: Optional[bool] = None @@ -379,6 +382,18 @@ class EngineArgs: 'qualified names that 
can be passed with the `logits_processors` ' 'extra completion argument. Defaults to None, which allows no ' 'processors.') + parser.add_argument( + '--model-impl', + type=str, + default=EngineArgs.model_impl, + choices=[f.value for f in ModelImpl], + help='Which implementation of the model to use.\n\n' + '* "auto" will try to use the vLLM implementation if it exists ' + 'and fall back to the Transformers implementation if no vLLM ' + 'implementation is available.\n' + '* "vllm" will use the vLLM model implementation.\n' + '* "transformers" will use the Transformers model ' + 'implementation.\n') # Parallel arguments parser.add_argument( '--distributed-executor-backend', @@ -1032,6 +1047,7 @@ class EngineArgs: generation_config=self.generation_config, override_generation_config=self.override_generation_config, enable_sleep_mode=self.enable_sleep_mode, + model_impl=self.model_impl, ) def create_load_config(self) -> LoadConfig: diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 739ea06ae3818b5346219ba61b93750e9944866f..053635a28638363fba08da0dc4b67bfa466e2ed7 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import copy import time diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index 4b18426252127c0979c42e85ad81c24f4773f25e..aa54c0693941fbc191be31ea9d26940673d60c51 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Workaround for https://github.com/python/cpython/issues/86296 # # From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b4cde5fc27065209f2ac8eba52155404c57a49fc..44e83dc5d3399af99bcce242a3f762282e24dc7a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + import os import copy import time diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index b771c190dd82a8a50335a388e41ee4bbb9e64ec4..ce806b4a937a1d407d233ae1398b7ba4ad814f0d 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 5c7a430d11c5afe76d41ad346633ca29e4f6e9e7..7f0c2fa70c3f9c82ee0b5c7e24b75b84d0944ced 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ These types are defined in this file to avoid importing vllm.engine.metrics and therefore importing prometheus_client. diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index d9703b820a779567dc52b4758e88b0d2b2d3eab8..3cf1850ee65ad779562db62d5884fbcf9dd8395a 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import uuid from dataclasses import dataclass, field from enum import Enum diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 5237f63c34c01f5d97c807636891201096d1c2a8..85b5f31e3a4aa51ce0c4765b75667c7b3a051296 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import copy import pickle diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 166f89743b3cd03916adefafbf1d37cbc04bd3ab..a0dd79586588eb1089cfe10aa55de69ee3772b93 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle import 
signal from contextlib import contextmanager diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 50adaf4e591887db93c36a010f85d863bb5ffc01..4c8e295c138150595faa0005094a77bd6ac7c68e 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Callable, List diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 99c2baf3f4df45a7456b9da619b9c824a08dc0db..8ceef855e020ffb70ffe47b09700026bd59b46ed 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools from typing import Callable, List, cast diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 55c56abea0da32c6ecb26d936f49f79330752f21..4d96791a1f8a389994e59061a3f15859d791dfa1 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List from vllm.config import SchedulerConfig diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 4b701f81504bb7d5839635a5277700ec42c7c991..3bca0bee35a4c8abbc565e442b61fa0e683cb4ff 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional, Tuple from vllm.lora.request import LoRARequest diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 770982a207e6c350677c87e6159b6bae9b605551..0d2b58c109e32f27f780f4ef538331705db0321d 100644 --- a/vllm/engine/output_processor/util.py +++ 
b/vllm/engine/output_processor/util.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List from typing import Sequence as GenericSequence from typing import cast diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index de7b2c1b91f50bc3b54932bdb79e3b7c0dff40ba..d1112558666ff37942bf440ee3c5a9bd42399758 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from abc import ABC, abstractmethod from typing import AsyncGenerator, List, Mapping, Optional diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index daefbff7e5178999e47f31fbf49c135baedb34d3..96818507d589fca1f4d602df8a25111d82a4e5d2 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. It is not intended for production use. 
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 97d2561df602aa916dac7bd54a835b55ffaa8852..f04902ae1c7678c736bcb49450339eeafcbb6b75 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import codecs import json @@ -408,7 +410,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return "" if model_type == "mllama": return "<|image|>" - if model_type == "qwen2_vl": + if model_type in ("qwen2_vl", "qwen2_5_vl"): return "<|vision_start|><|image_pad|><|vision_end|>" if model_type == "molmo": return "" @@ -428,7 +430,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): return "()" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": - if model_type == "qwen2_vl": + if model_type in ("qwen2_vl", "qwen2_5_vl"): return "<|vision_start|><|video_pad|><|vision_end|>" if model_type in ("minicpmo", "minicpmv"): return "()" diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 5dcf50bd1b0a1bb7249cee95117d77bcdc0d5c23..351a39525fa621870b7d60192478abcd6a5746f2 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import signal from http import HTTPStatus diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 46b595b0da73cabcbbde3dfa34712f3ddae7bf91..d071a0b3cfc5d313bc0ef0055d861eca461acb6a 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools import warnings from contextlib import contextmanager diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index 584ee0d9e1c54a5ac22aad4e79ba6835416cf0fe..e82b6ba6c7bae3c0496f395324b4a405dc34c435 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, 
Optional, Union from vllm.logger import init_logger diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9e5cf4ba2e490103a90f904fbfe473dc6b4e9adc..b8f54d6c78042dd944bd5410dd3a12ded13fa877 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import atexit import gc diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 9cfe07c65d55e927c2940226d81d32a9238073ae..3054958f3c8abc1e618c21a7fe260169efa6f23f 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file contains the command line arguments for the vLLM's OpenAI-compatible server. It is kept in a separate file for documentation diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py index c8132811de9035f3fc190cfa72afe94f4c2903b4..41e5eef40eaf82ce3afa59523a2ed24796338abe 100644 --- a/vllm/entrypoints/openai/logits_processors.py +++ b/vllm/entrypoints/openai/logits_processors.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache, partial from typing import Dict, FrozenSet, Iterable, List, Optional, Union diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 29d071ce50c8ec1ceefc8830e20117ff57d29cee..83b841826231ef17c2426f73adfda256942f41ce 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import re diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/entrypoints/openai/reasoning_parsers/__init__.py index 
a21bff52f61faa3a3cf99ff88f93a0a2550183ed..80354d69b50afe64a38df6532af78e10cc6858fa 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/__init__.py +++ b/vllm/entrypoints/openai/reasoning_parsers/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py index e5d10ee0bc3a812fcd77f467356236c65dd5ee41..b5df7e47446b7acc74cd77c4f905f87a95581716 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py +++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from functools import cached_property from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py index a440ddc8d3b5dc4b123d93ef7c360290ec173041..5c19888d4540137fb7d07150720b0ad5f5e849b9 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py +++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import re from typing import Optional, Sequence, Tuple, Union diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 37ae23506acea6427c9ec29ce0bf07e4d62c6784..675d3cdcf97155073c07d8afbdd5e5878b23ba78 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from http import HTTPStatus from io import StringIO diff --git a/vllm/entrypoints/openai/serving_chat.py 
b/vllm/entrypoints/openai/serving_chat.py index dc97f0eb059d7f528da349d5c55c9eb6077789bc..107220d548afc0adf506fb24c6b1a10ea2a3b232 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import json import time diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 13c39263688906a94b2d92701b90c98825a511b1..e7ad263e7fbe5049dcab508139b67d0bbc2d5aa9 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import time from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e7116a3d95d1078e306ee03f052534601df81308..45f8ad90ddcb3d67d56e49ccfa39bcc4ec2d135d 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import base64 import time diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8d54164e500eb147ef5e8e17abeb035e7e5d8e5d..8d39fdcb748330545539b1e71c8cae88d71483c6 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from concurrent.futures.thread import ThreadPoolExecutor from http import HTTPStatus diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 22e74b387cd73995efcce1ec658f45d612a7cae5..f917a48519016c7300cac638796649bc79fc7a5d 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json 
import pathlib from dataclasses import dataclass diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 5830322071e58a4381f6f4c10d1b2d5f9a305b2a..01a3d211f6ba633988782cbd7af6d71e556f72b2 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import base64 import time diff --git a/vllm/entrypoints/openai/serving_rerank.py b/vllm/entrypoints/openai/serving_rerank.py index be4420261afe3074e8ff899b02f741e07d898594..366df71217e9101c6d7b381bdf18efeef64752ff 100644 --- a/vllm/entrypoints/openai/serving_rerank.py +++ b/vllm/entrypoints/openai/serving_rerank.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 381edf8fac49ea16a45c862868b3b48f7059bf77..832aa8516cc359777e5e6326276a656fe7038dab 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import time from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index b67ecfb01316f5b67a96558457db1dcce10cde57..6c79adf90c8ad13e9afb640c278ebdec9d6c59ff 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Final, List, Optional, Union from fastapi import Request diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 2850349a4483574fa85a1f255ba9ac8072089ab4..d1c3afa64b96cc7216ecd9c8eed5f41336006a39 100644 --- 
a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .abstract_tool_parser import ToolParser, ToolParserManager from .granite_20b_fc_tool_parser import Granite20bFCToolParser from .granite_tool_parser import GraniteToolParser diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index aa7c201098935025a17f21fb947a332ab573ade6..7cdd6d4c4f2ba69d1616caa4b1e4cbd174a08f39 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from functools import cached_property from typing import Callable, Dict, List, Optional, Sequence, Type, Union diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 93e357e8b9f21d0ce51b7cd617efd04360bcacf9..002bf173883086f80bedcd61477ce9a0501e28fc 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from json import JSONDecoder diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index 8aefcd8d58a397efea0cf5d48641869217d26cb4..c948ed78f503b9bb9f760463846e5d459f11c21b 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Dict, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 
869d15ac359eada8cdc0a89f4c1981e8bc3fb369..4841b28703ee3beff672150f465577d57a17251b 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from typing import Dict, List, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index cb391e11bbde259c332c055b4cf9f2abaf4f1b22..b9215e7979bf534303ada53e5e7b8c9b54a89c08 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from typing import Dict, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index cfd024853f887b903b0fc3cec8bdbe8bc096d6de..7c4d63e18865376339d023ca5d91abb0d648b8ce 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from typing import Dict, List, Sequence, Union diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 1856308b88cfaa27d9042c808f6b9dad64cd1088..6a7b113623e6515d3a31307f1ec42c3451a5fec3 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from json import JSONDecoder diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index bada805dd35b9aba437fd83f5e7e3c350333eb0e..51354f7c9562355c9791cee5bc695097646f9984 
100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json import re from random import choices diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 26da4d689fb8b56ecb2bbc1dca01a160b313664a..5c282b5c2605a6cc8a9c85a377cf7f31c8aab967 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import ast import json import re diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py index 5e4eb23bfaf4336518292199de0a02cbcee59baa..945cbd6835028b3d73297b20cdd32b69f9584112 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import json from json import JSONDecodeError, JSONDecoder from typing import Any, List, Tuple diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index e8a78d216d0f05b70d2bc5eb4b75231d69da8ffa..9af37871d57c8afb26a0c534db4d98938f1b8aa9 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import functools diff --git a/vllm/envs.py b/vllm/envs.py index efe576263e60ec395da643269c598e18d1815e59..dfea2bfc3f40cdb109e1437ce32296d22ea5ae94 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import tempfile from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional @@ -34,6 +36,7 @@ if TYPE_CHECKING: VLLM_LOGGING_LEVEL: str = "INFO" VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_CONFIG_PATH: Optional[str] = None + VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = 
None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None @@ -86,6 +89,10 @@ if TYPE_CHECKING: VLLM_MLA_DISABLE: bool = False VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True VLLM_MLA_DISABLE_REQUANTIZATION: bool = False + VLLM_CUDA_MEM_ALIGN_KV_CACHE: bool = True + VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False + VLLM_RAY_PER_WORKER_GPUS: float = 1.0 + VLLM_RAY_BUNDLE_INDICES: str = "" def get_default_cache_root(): @@ -309,6 +316,14 @@ environment_variables: Dict[str, Callable[[], Any]] = { "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # if set, vllm will call logits processors in a thread pool with this many + # threads. This is useful when using custom logits processors that either + # (a) launch additional CUDA kernels or (b) do significant CPU-bound work + # while not holding the python GIL, or both. + "VLLM_LOGITS_PROCESSOR_THREADS": + lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")) + if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None, + # Trace function calls # If set to 1, vllm will trace function calls # Useful for debugging @@ -565,7 +580,34 @@ environment_variables: Dict[str, Callable[[], Any]] = { # matrices to match the activation type. This can lead to higher memory and # compute usage but better preserves the accuracy of the original model. "VLLM_MLA_DISABLE_REQUANTIZATION": - lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))) + lambda: bool(int(os.getenv("VLLM_MLA_DISABLE_REQUANTIZATION", "0"))), + + # If set, vLLM will use the Triton implementation of moe_align_block_size, + # i.e. moe_align_block_size_triton in fused_moe.py. 
+ "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": + lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) + ), + + # Number of GPUs per worker in Ray, if it is set to be a fraction, + # it allows ray to schedule multiple actors on a single GPU, + # so that users can colocate other actors on the same GPUs as vLLM. + "VLLM_RAY_PER_WORKER_GPUS": + lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")), + + # Bundle indices for Ray, if it is set, it can control precisely + # which indices are used for the Ray bundle, for every worker. + # Format: comma-separated list of integers, e.g. "0,1,2,3" + "VLLM_RAY_BUNDLE_INDICES": + lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""), + + # When on an NVIDIA GPU, aligns single entries (within a page) so they are + # 256-byte aligned for better performance; this increases the memory usage + # of the cache. This matches the alignment the CUDA runtime uses for all + # allocations. Currently this primarily affects MLA, which results in + # non-256-byte-aligned entries; for most other models the entries are + # already naturally aligned to 256 bytes. 
+ "VLLM_CUDA_MEM_ALIGN_KV_CACHE": + lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))), } # end-env-vars-definition diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 471d1bfac3119bfbae1c9115af5597354c826a1e..fb76276bb4b3476f70d4eb6b619bd40ead168dad 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from abc import ABC, abstractmethod from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index 78c86321d861d08c88086c9ca5b51ae031f906ee..d1f8c36fbbec7638f89c0e828cbc42e3a66bee02 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from typing import Any, Callable, List, Optional, Union diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index c467115f124cafef1e1498ffb227d72e294023cc..e680d53cbd10e216407f0850a84d2f0f688967d9 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from typing import Any, Type diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 539b6ae2d3572eb55eba566fdfe5af2b5d41cb3f..cef6a994a9c09cfaea91f8240d6c64a253d88d80 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import sys diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 2afd99f99b353035f6f8948c6859790922be935e..6a25a4d50fb98b083b83a84a770429144fc0e3c7 100644 --- a/vllm/executor/ray_distributed_executor.py +++ 
b/vllm/executor/ray_distributed_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from collections import defaultdict @@ -127,13 +129,7 @@ class RayDistributedExecutor(DistributedExecutorBase): def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if (self.parallel_config.tensor_parallel_size == 1 - and self.parallel_config.pipeline_parallel_size == 1): - # For single GPU case, we use a ray worker with constrained memory. - num_gpus = self.cache_config.gpu_memory_utilization - else: - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 + num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. @@ -153,12 +149,29 @@ class RayDistributedExecutor(DistributedExecutorBase): logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) # Create the workers. - driver_ip = get_ip() - rank = 0 + bundle_indices: List[int] + if envs.VLLM_RAY_BUNDLE_INDICES: + # Use the bundle indices specified by the user. + bundle_indices = list( + map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(","))) + assert len(bundle_indices) == self.parallel_config.world_size, \ + ("VLLM_RAY_BUNDLE_INDICES must have the same size" + f" as the world size, but got {bundle_indices=} " + f"and {self.parallel_config.world_size=}") + assert len(set(bundle_indices)) == len(bundle_indices), \ + ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values," + f" but got {bundle_indices=}") + else: + # use the first N bundles that have GPU resources. 
+ bundle_indices = [] + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if bundle.get(current_platform.ray_device_key, 0): + bundle_indices.append(bundle_id) + bundle_indices = bundle_indices[:self.parallel_config.world_size] + worker_metadata: List[RayWorkerMetaData] = [] - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get(current_platform.ray_device_key, 0): - continue + driver_ip = get_ip() + for rank, bundle_id in enumerate(bundle_indices): scheduling_strategy = PlacementGroupSchedulingStrategy( placement_group=placement_group, placement_group_capture_child_tasks=True, @@ -185,7 +198,6 @@ class RayDistributedExecutor(DistributedExecutorBase): rpc_rank=rank) worker_metadata.append( RayWorkerMetaData(worker=worker, created_rank=rank)) - rank += 1 worker_ips = ray.get([ each.worker.get_node_ip.remote() # type: ignore[attr-defined] diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index e55155ea06225301d43e39fbfd89c18295275af7..7b30155971a6d353f5b6f18ac894ac2c2b6d1bef 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import time from collections import defaultdict @@ -212,7 +214,10 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): logger.info( "Waiting for creating a placement group of specs for " "%d seconds. specs=%s. 
Check " - "`ray status` to see if you have enough resources.", + "`ray status` to see if you have enough resources," + " and make sure the IP addresses used by ray cluster" + " are the same as VLLM_HOST_IP environment variable" + " specified in each node if you are running on a multi-node.", int(time.time() - s), placement_group_specs) try: diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index a5c4dcf0ec7f9c5e0f9cbf5fce00bddba00d6b50..dcb4a8f27c25213365a4422d912d535e5e845d6f 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 828b394ec5d21e5ad91a7ffe6eb7d2c77351ff4f..10de8bc593ab805ec06664c2ed0125bdb4a708b7 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from collections import defaultdict from contextlib import contextmanager diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index a0dd89f69bacd50bbd7ef90037e998c5bf228bfc..6f8f2cd758f7bf9768b1a552dbb4e2ad980c7278 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, SingletonInputs, SingletonInputsAdapter, SingletonPrompt, diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index b06fbfc552d44cc33bbe4c5dfa24842cf3b5f246..db25936fe69cff24624dd3f4a4917d96a4fee489 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from functools import cached_property from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, diff --git a/vllm/inputs/parse.py 
b/vllm/inputs/parse.py index 09f1ff2cb42e9301c5d3ef1a125982c70cd56f22..454d9d8303b77bfec97a597e4972ae18775c3614 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Literal, Sequence, TypedDict, Union, cast, overload from typing_extensions import TypeIs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 70372e0cad22d001049c6dfaf2b0f510e1112ea3..4d8f28cb0417533e73244f5df1f5a1e4f78f6daf 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from typing import List, Mapping, Optional, Union diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 4b73ade7af5f068b3da0bbd986329f3c5ece7dd0..cd421443981f53eb25d8944ab2fbfca50b486783 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools from collections import UserDict from dataclasses import dataclass @@ -29,6 +31,17 @@ C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig) P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin) +class HashableDict(dict): + """ + A dictionary that can be hashed by lru_cache. + """ + + # NOTE: pythonic dict is not hashable, + # we override on it directly for simplicity + def __hash__(self) -> int: # type: ignore[override] + return hash(frozenset(self.items())) + + @dataclass(frozen=True) class InputContext: """ @@ -102,6 +115,13 @@ class InputContext: if isinstance(typ, type): merged_kwargs["processor_cls"] = typ + # NOTE: Pythonic dict is not hashable and will raise unhashable type + # error when calling `cached_get_processor`, therefore we need to + # wrap it to a hashable dict. 
+ for key, value in merged_kwargs.items(): + if isinstance(value, dict): + merged_kwargs[key] = HashableDict(value) + hf_processor = cached_get_processor( self.model_config.model, trust_remote_code=self.model_config.trust_remote_code, diff --git a/vllm/logger.py b/vllm/logger.py index cac174f7ba02a07998e8426905dc19760ee51fab..b20d55e3c1019ba7fd8d15bd6950f4c094cc2316 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Logging configuration for vLLM.""" import datetime import json diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 576ccf78a811729be3b6b17fbeee36cf3e6239cf..7ab4632589bf499cf967a02794e0381413fb946e 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.logging_utils.formatter import NewLineFormatter __all__ = [ diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py index b24b4e11d1fcbf017c066bda444259cd68b663a4..010b0a124987b5ff75820d3ff5067ea3803cf3cc 100644 --- a/vllm/logging_utils/formatter.py +++ b/vllm/logging_utils/formatter.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging diff --git a/vllm/logits_process.py b/vllm/logits_process.py index 7716ccd27e253f0574ffc52f5d2ac44b04129e24..d02072e8f81894894348c0501eb2ae6ec2081962 100644 --- a/vllm/logits_process.py +++ b/vllm/logits_process.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Tuple, Union import torch diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 545ec21ca74c1d1ded8b600ae7f072b7d7960557..3d6620817b4bb4f78d46d91e64aef231522fda4f 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # pylint: disable=unused-argument from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast diff --git 
a/vllm/lora/layers.py b/vllm/lora/layers.py index cdd439d0385b6ae08fa7167f47e1961964c915b6..9f0297596ccbf236266f6e9daedfe1666dcf0d90 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # pylint: disable=unused-argument import math from dataclasses import dataclass diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 93ad4651f4b774686ca52e754db5115e61e7b063..00299bf6c2a81446e5faa1ddf1ce2868f6b95046 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional from typing import Sequence as GenericSequence diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d93fa15c1f70672d9bab74743b80452eda154c6e..560e6116b411e46af16955195d22d0e85fcd143f 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy import math import os diff --git a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py index 9c9159b95f30828280907599b04ca9dcd140974b..85601d58c9d73d4680806d13d0414d238454da10 100644 --- a/vllm/lora/ops/torch_ops/__init__.py +++ b/vllm/lora/ops/torch_ops/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401 from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink, sgmv_expand, sgmv_expand_slice, diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py index 5f5aafd51615911cc9b7dd5e2979d7993b72ae87..af79f98415cbc1d83bd0e3446f4596562f776315 100644 --- a/vllm/lora/ops/torch_ops/lora_ops.py +++ b/vllm/lora/ops/torch_ops/lora_ops.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py index 9805b6dd5038eb2b8d2d17d4a1473048ec86236b..dc440f7327fa4316cf5bccd494c57c8c79602786 100644 --- 
a/vllm/lora/ops/triton_ops/__init__.py +++ b/vllm/lora/ops/triton_ops/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink diff --git a/vllm/lora/ops/triton_ops/bgmv_expand.py b/vllm/lora/ops/triton_ops/bgmv_expand.py index 42adb191b8eadc93b7b82dbde1ff33b44e3a9470..98510b39661a60c9777935fc39b56a248bc25ad1 100644 --- a/vllm/lora/ops/triton_ops/bgmv_expand.py +++ b/vllm/lora/ops/triton_ops/bgmv_expand.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/bgmv_expand_slice.py b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py index f397d752a3ea920a83e1a0af777a94c809585444..48804123c1eae1d3a601bd644025759ebfc76c9f 100644 --- a/vllm/lora/ops/triton_ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/triton_ops/bgmv_expand_slice.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/bgmv_shrink.py b/vllm/lora/ops/triton_ops/bgmv_shrink.py index f3ef01d39e7764275d9f618a7af953130bfbd70c..227a5765e56be6450ffbe0d4665634e3e30c322d 100644 --- a/vllm/lora/ops/triton_ops/bgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/bgmv_shrink.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 48fa5cd63741f5ce4997971790b98c31930de1f3..a8e71cacfe5a2e76b9ab1577b4082a977ed37d50 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 9bb35e8ffd323f9750f5e8bca3fd9da11d14b35a..8b26583c11c14eb4c649fc7706ffe4319a96df2f 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 7df5bc2c225e5e2398be9a448866a8136639ac4f..78409b91a14e80177d710f0ab8d7d02a950b16fa 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools from typing import Dict, List, Tuple diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index b9c506f6e0bfd7995022ce835be0d7f299cf4fc3..9496ab5a75c0710b2a0957ea68cb0b76684b37a8 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py import json diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py index 48ada3926ea4681a97df6cfbc0547904c434a1a1..915fc6623398e2aa2ff67723aa3770d35b4aa1db 100644 --- a/vllm/lora/punica_wrapper/__init__.py +++ b/vllm/lora/punica_wrapper/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.lora.punica_wrapper.punica_base import 
PunicaWrapperBase from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index b9ec0c4bc632392b6d6e872a61421b6ed36105c2..1a2282ae9accd5ca26691f64d4e4d7ca6dce0d49 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py index b9ae3e07492c0faf53100386fb6ea07efd05b78e..29428f4cfff3175e782618cbd16c727d56771798 100644 --- a/vllm/lora/punica_wrapper/punica_cpu.py +++ b/vllm/lora/punica_wrapper/punica_cpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, Optional, Tuple, Union import torch diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 451f23e49f27cc32be1487a5358286b934287edc..9ccd9c36a073ecd5b0ff1f3f815a797977d0e445 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index d9c4f44a1c282ce6fad8d44fd478faba21c5d11b..51e1bfab3f5136ab17732b2578d864fe3e0043d7 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple, Union, final import torch diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index a2932246519922dafc2e10f0857da95a0c3653cc..ad5d4b788ec435a970f35d1625e127e3d3812bcd 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import resolve_obj_by_qualname diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 7360c8c09e3acc6c8762fbc6225aeb76683b4ac1..dbc2d27c597f20c8a5aa79e87b96b8f76e9979bc 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, List, Optional, Tuple, Union import torch diff --git a/vllm/lora/request.py b/vllm/lora/request.py index b3cfca3dfc8769c8e11c6ef1ee6f3ba3dd0eafa3..8fd003173243f23fefdee893bc1f13b073ecf885 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings from typing import Optional @@ -31,7 +33,7 @@ class LoRARequest( base_model_name: Optional[str] = msgspec.field(default=None) def __post_init__(self): - if 'lora_local_path' in self.__struct_fields__: + if self.lora_local_path: warnings.warn( "The 'lora_local_path' attribute is deprecated " "and will be removed in a future version. 
" diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index d72b7638d84af5b8e323b69921eb2483e09dad21..f47b0af1552262c3007daa042caff49cf30c4cbc 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import re from typing import List, Optional, Set, Tuple, Type, Union diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index a64296f7fd9021dde79f415237fe349f55ccaeed..f33a7b88cc35ffc53eab9eebb6072cdc674194d2 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from contextlib import contextmanager from typing import Any, Dict, List, Literal, Optional, Set, Type, Union diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index 7278c7fbe8bea258846e8ed69f1d84183d3bd7a6..7636152176f13962a68576a432582e261d5c1fca 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.model_executor.parameter import (BasevLLMParameter, PackedvLLMParameter) from vllm.model_executor.sampling_metadata import (SamplingMetadata, diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 96995c56bf504616b38ade92e137773ff7d572e9..ee4f41ea6ec9080ece0175c54d1b7027cbb109ae 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, Type import torch.nn as nn diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 18b435a42544a2b12969e2dc754998cae890abee..cf96461a549f39450a8c424805e9a72be5c0969d 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from __future__ import annotations from typing import 
TYPE_CHECKING diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py index 8deb4c949824a39b5256c08c5236c29a8046e494..db4ce26806c1ffb48b9ad31ac70af27938d69f1d 100644 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ b/vllm/model_executor/guided_decoding/guided_fields.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List, Optional, TypedDict, Union diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index a17e75a80300f07e25f06cdae7f2bf925dff7593..7eaf9e38e66a33ab3fc450b5053366dc5860de63 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache from json import loads as json_loads from typing import Optional, Union diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index eb8db882435e604c084dd2e6e3820b9233845c04..ba9c98290368260ad57d08bb9d1fae5bb9de9849 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import concurrent.futures import os diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index e4eb3f16e56cf33141b8ae91861597a2b5eee90f..ab72b55a8943588e4a6529c32cb4d575d1f74a15 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024- the Outlines developers # 
This file is adapted from # https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 90dfa62ec46702c4bc087d728ab315e5883a4063..87ef4535845773fbfce1a2d95b34a89e37f5d967 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import re diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index ee30ce96f0a1ebb1438bed7ac3aca18c609c0885..c01bd3af1d5b990cb0b6e15b22a63c13f7509c97 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # noqa: UP007 from __future__ import annotations diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index e5f25189d8e7ccce3cc7c619a9853f7bc9deb658..8aba4601bbb63587d0b0e9c3d65c3a940d16ca61 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Custom activation functions.""" import math from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index c4223d12600aca2d184c056aa629b0572628c660..6f933c3fa3c9f86d9b52230c059e244cef4ca8a5 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from contextlib import contextmanager from typing import Any, Dict, Optional diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 87993267c05b5b8fa15db07b3e75ed7b22d08854..4ca569ca4f19b360966b348980eaaa33af8b2ba6 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Fused MoE utilities for GPTQ.""" import functools from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 10bb5457a21e79d1c9a10d0a0563727b50867843..c59f9ffef1ae77c7de0c7ed793f2d5450368c200 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Fused MoE kernel.""" import functools import json @@ -417,6 +418,144 @@ def fused_moe_kernel( tl.store(c_ptrs, accumulator, 
mask=c_mask) +def ceil_div(a, b): + return (a + b - 1) // b + + +@triton.jit +def moe_align_block_size_stage1( + topk_ids_ptr, + tokens_cnts_ptr, + num_experts: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + + start_idx = pid * tokens_per_thread + + off_c = (pid + 1) * num_experts + + for i in range(tokens_per_thread): + if start_idx + i < numel: + idx = tl.load(topk_ids_ptr + start_idx + i) + token_cnt = tl.load(tokens_cnts_ptr + off_c + idx) + tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1) + + +@triton.jit +def moe_align_block_size_stage2( + tokens_cnts_ptr, + num_experts: tl.constexpr, +): + pid = tl.program_id(0) + + last_cnt = 0 + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid) + last_cnt = last_cnt + token_cnt + tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt) + + +@triton.jit +def moe_align_block_size_stage3( + total_tokens_post_pad_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, +): + last_cumsum = 0 + off_cnt = num_experts * num_experts + for i in range(1, num_experts + 1): + token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1) + last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size + tl.store(cumsum_ptr + i, last_cumsum) + tl.store(total_tokens_post_pad_ptr, last_cumsum) + + +@triton.jit +def moe_align_block_size_stage4( + topk_ids_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + tokens_cnts_ptr, + cumsum_ptr, + num_experts: tl.constexpr, + block_size: tl.constexpr, + numel: tl.constexpr, + tokens_per_thread: tl.constexpr, +): + pid = tl.program_id(0) + start_idx = tl.load(cumsum_ptr + pid) + end_idx = tl.load(cumsum_ptr + pid + 1) + + for i in range(start_idx, end_idx, block_size): + tl.store(expert_ids_ptr + i // block_size, pid) + + start_idx = pid * tokens_per_thread + off_t = pid * num_experts + + for i in range(start_idx, tl.minimum(start_idx + 
tokens_per_thread, + numel)): + expert_id = tl.load(topk_ids_ptr + i) + token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id) + rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id) + tl.store(sorted_token_ids_ptr + rank_post_pad, i) + tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1) + + +# Triton implementation based on: +# https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0 +def moe_align_block_size_triton( + topk_ids: torch.Tensor, + num_experts: int, + block_size: int, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_pad: torch.Tensor, +) -> None: + numel = topk_ids.numel() + grid = (num_experts, ) + tokens_cnts = torch.zeros((num_experts + 1, num_experts), + dtype=torch.int32, + device=topk_ids.device) + cumsum = torch.zeros((num_experts + 1, ), + dtype=torch.int32, + device=topk_ids.device) + tokens_per_thread = ceil_div(numel, num_experts) + + moe_align_block_size_stage1[grid]( + topk_ids, + tokens_cnts, + num_experts, + numel, + tokens_per_thread, + ) + moe_align_block_size_stage2[grid]( + tokens_cnts, + num_experts, + ) + moe_align_block_size_stage3[(1, )]( + num_tokens_post_pad, + tokens_cnts, + cumsum, + num_experts, + block_size, + ) + moe_align_block_size_stage4[grid]( + topk_ids, + sorted_token_ids, + expert_ids, + tokens_cnts, + cumsum, + num_experts, + block_size, + numel, + tokens_per_thread, + ) + + def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -469,8 +608,28 @@ def moe_align_block_size( num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device) - ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids, - expert_ids, num_tokens_post_pad) + if num_experts >= 224: + if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: + moe_align_block_size_triton( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, 
+ ) + else: + ops.sgl_moe_align_block_size( + topk_ids, + num_experts, + block_size, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) + else: + ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids, + expert_ids, num_tokens_post_pad) return sorted_ids, expert_ids, num_tokens_post_pad @@ -620,14 +779,13 @@ def get_config_file_name(E: int, device_name = current_platform.get_device_name().replace(" ", "_") dtype_selector = "" if not dtype else f",dtype={dtype}" block_shape_selector = ("" if not block_shape or not all(block_shape) else - f",block_shape={block_shape}") + f",block_shape={block_shape}").replace(" ", "") if not use_nn_moe: return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json" # noqa: E501 else: return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}_nn.json" - # Adapted from: https://github.com/sgl-project/sglang/pull/2628 @functools.lru_cache def get_moe_configs( @@ -785,6 +943,7 @@ def fused_topk( # This is used by the Deepseek-V2 and Deepseek-V3 model +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def grouped_topk(hidden_states: torch.Tensor, gating_output: torch.Tensor, topk: int, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 8eb97aeef474085e5bb8b82c612372c69cf31f0f..c461e7da0af6555050e55efe6fd446b02505fdd8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from abc import abstractmethod from enum import Enum diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index 563ee18c64304132606719e4712b7253bf9a9ad8..0365afa10a459ed01185c3f05323c11e8b817e46 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -1,3 +1,5 
@@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.nn.functional as F from torch_xla.experimental.custom_kernel import _histogram diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py index bcff55f4fdf16e6b273c7ba05ffc635eb4e63054..d9a5de1b3033a7088b518ead95523bcd353da845 100644 --- a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import torch.nn.functional as F diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index bba8f490cf766bf805285fb8c0ebf11fc05c534f..59c52774196d315cb6bf31cfe1dc7dc48ac2eb4a 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Custom normalization layers.""" from typing import Optional, Tuple, Union diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 6d0f30ed07df9a163a5b870141fb2247302fd097..49a6f1e927dae4843cacba400a94024a9363ad96 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -1,6 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from abc import abstractmethod -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch import torch.nn.functional as F @@ -48,8 +50,8 @@ def adjust_marlin_shard(param, shard_size, shard_offset): def adjust_bitsandbytes_4bit_shard(param: Parameter, - shard_offsets: Dict[str, Tuple[int, int]], - loaded_shard_id: str) -> Tuple[int, int]: + shard_offsets: dict[str, tuple[int, int]], + loaded_shard_id: str) -> tuple[int, int]: """Adjust the quantization offsets and sizes for BitsAndBytes sharding.""" total, _ = shard_offsets["total"] @@ -91,7 +93,7 @@ class 
LinearMethodBase(QuantizeMethodBase): @abstractmethod def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """Create weights for a linear layer. @@ -128,7 +130,7 @@ class UnquantizedLinearMethod(LinearMethodBase): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), @@ -196,7 +198,8 @@ class LinearBase(torch.nn.Module): self.quant_method = quant_config.get_quant_method(self, prefix=prefix) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, + x: torch.Tensor) -> tuple[torch.Tensor, Optional[Parameter]]: raise NotImplementedError @@ -257,9 +260,8 @@ class ReplicatedLinear(LinearBase): assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) - def forward( - self, x: torch.Tensor - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + def forward(self, + x: torch.Tensor) -> tuple[torch.Tensor, Optional[Parameter]]: bias = self.bias if not self.skip_bias_add else None assert self.quant_method is not None output = self.quant_method.apply(self, x, bias) @@ -305,7 +307,7 @@ class ColumnParallelLinear(LinearBase): skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None, + output_sizes: Optional[list[int]] = None, prefix: str = ""): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix) @@ -391,7 +393,7 @@ class ColumnParallelLinear(LinearBase): loaded_weight = loaded_weight.reshape(1) 
param.load_column_parallel_weight(loaded_weight=loaded_weight) - def forward(self, input_): + def forward(self, input_) -> tuple[torch.Tensor, Optional[Parameter]]: bias = self.bias if not self.skip_bias_add else None # Matrix multiply. @@ -439,7 +441,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): def __init__(self, input_size: int, - output_sizes: List[int], + output_sizes: list[int], bias: bool = True, gather_output: bool = False, skip_bias_add: bool = False, @@ -517,7 +519,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): current_shard_offset = 0 use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - shard_offsets: List[Tuple[int, int, int]] = [] + shard_offsets: list[tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size @@ -619,7 +621,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): """ current_shard_offset = 0 - shard_offsets: List[Tuple[int, int, int]] = [] + shard_offsets: list[tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size @@ -1141,7 +1143,7 @@ class RowParallelLinear(LinearBase): param.load_row_parallel_weight(loaded_weight=loaded_weight) - def forward(self, input_): + def forward(self, input_) -> tuple[torch.Tensor, Optional[Parameter]]: if self.input_is_parallel: input_parallel = input_ else: diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 42decde1d0f79069e36bb3d8fa6119c2bd3cd253..cdc67ca83d489353be7c92ed960ca84fe923b2fb 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,5 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 """A layer that compute logits from hidden_stats.""" import inspect +from concurrent.futures import 
ThreadPoolExecutor from typing import Optional import torch @@ -14,6 +16,11 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform +_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None +if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: + _logits_processor_threadpool = ThreadPoolExecutor( + envs.VLLM_LOGITS_PROCESSOR_THREADS) + class LogitsProcessor(nn.Module): """Process logits and apply logits processors from sampling metadata. @@ -134,6 +141,7 @@ def _apply_logits_processors( ) -> torch.Tensor: found_logits_processors = False logits_processed = 0 + logits_row_ids_and_logits_row_futures = [] for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params @@ -147,22 +155,39 @@ def _apply_logits_processors( past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids - for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: - logits_row = logits_processor(prompt_tokens_ids, - past_tokens_ids, - logits_row) - else: - logits_row = logits_processor(past_tokens_ids, - logits_row) - - logits[logits_row_idx] = logits_row + if _logits_processor_threadpool is not None: + logits_row_ids_and_logits_row_futures.append( + (logits_row_idx, + _logits_processor_threadpool.submit( + _apply_logits_processors_single_seq, logits_row, + logits_processors, past_tokens_ids, + prompt_tokens_ids))) + else: + logits[logits_row_idx] = \ + _apply_logits_processors_single_seq( + logits_row, logits_processors, past_tokens_ids, + prompt_tokens_ids) logits_processed += len(seq_group.sample_indices) + len( seq_group.prompt_logprob_indices) + for logits_row_idx, future in logits_row_ids_and_logits_row_futures: + logits[logits_row_idx] = future.result() + if 
found_logits_processors: # verifies that no rows in logits were missed unexpectedly assert logits_processed == logits.shape[0] return logits + + +def _apply_logits_processors_single_seq(logits_row, logits_processors, + past_tokens_ids, + prompt_tokens_ids) -> torch.Tensor: + for logits_processor in logits_processors: + parameters = inspect.signature(logits_processor).parameters + if len(parameters) == 3: + logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, + logits_row) + else: + logits_row = logits_processor(past_tokens_ids, logits_row) + return logits_row diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 606c796d503cfb2ce7f1cfe78480f3fefa98da33..93c3cc91bb0929d2e4cf4afc8ba3f5016b42b093 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from torch import nn from torch.nn.parameter import Parameter diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index be5639df985fa25c88e08867edd318880f4fee88..21e27160f090b26401be7d6dbc922c5e2d50d183 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright (c) 2024, Tri Dao. # Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index 1484b79815ab9d3d8921164176afde169b43d3bb..3c35f1ac0dcf58b940c0accc954a3e7a80766bf5 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright (c) 2024, Tri Dao, Albert Gu. 
# Adapted from https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/selective_state_update.py diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 75bf33dc70a51a3d2916ebc7a23f5f94131be085..0012636ef9ffccb2a36853989e703eecb13e2c50 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from enum import IntEnum from typing import List, Optional, Union diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index bd0fd4799339610f49a2474be3244172415fa272..6ded3874fc1dd8080aa87c3c15e76597d41ddedb 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Type from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 72c89fe2b0e489370e405863f8ca7b42dde5de8b..6c08d016c0f7ba003ff12c28d29f9bdee29ffe45 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Supports AQLM compression, see https://github.com/Vahe1994/AQLM # and https://arxiv.org/pdf/2401.06118.pdf diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index f15554cfba69bb82e2d6710ea693976d6b0ad1df..7c1cd5fa37bc300773afaa0a51aaa3b5b9a5f55f 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py 
b/vllm/model_executor/layers/quantization/awq_marlin.py index 0c3c9816878e9a7a974c2d5703ee637a188fa342..8849ba292822831edde80695f275d574c1869e38 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py index ace8f4a348812a31693af7bc1f6748412f520daf..09efd4dbd79756f3b36f660a3984dd1ebc88c4ff 100644 --- a/vllm/model_executor/layers/quantization/awq_triton.py +++ b/vllm/model_executor/layers/quantization/awq_triton.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch import triton import triton.language as tl diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 2fb2642dd51562c405a4c146f72d29ca3f3093b7..c0d8553c0df1a9d9f151470948a68e698fcc9802 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -1,6 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 + import inspect from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Mapping, Optional, Type import torch from torch import nn @@ -57,6 +59,7 @@ def method_has_implemented_embedding( class QuantizationConfig(ABC): """Base class for quantization configs.""" + packed_modules_mapping: Mapping[str, List[str]] = dict() @abstractmethod def get_name(self) -> str: diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 5dc872933282c826b66882af6db53634c6c007f6..889eda009df15f1b9686cd778aa45707012f82eb 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ 
b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 37981ed918e7a2d2877b3b47ec5147c790965e5f..6ee3e9362f8d23ccd12555ea9b0d7a9b228157e0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from contextlib import suppress from typing import Any, Dict, List, Literal, Optional, Tuple, cast @@ -81,7 +83,9 @@ class CompressedTensorsConfig(QuantizationConfig): # Check if the layer is skipped for quantization. # TODO (@robertgshaw2): support module names - if should_ignore_layer(prefix, ignore=self.ignore): + if should_ignore_layer(prefix, + ignore=self.ignore, + fused_mapping=self.packed_modules_mapping): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) @@ -377,34 +381,29 @@ class CompressedTensorsConfig(QuantizationConfig): # Will be empty for models with only sparsity weight_quant = input_quant = None - sparsity_scheme: Optional[SparsityCompressionConfig] = None if self.target_scheme_map: matched_target = find_matched_target( layer_name=layer_name, module=layer, - targets=self.target_scheme_map.keys()) + targets=self.target_scheme_map.keys(), + fused_mapping=self.packed_modules_mapping) scheme_dict = self.target_scheme_map[matched_target] weight_quant = scheme_dict.get("weights") input_quant = scheme_dict.get("input_activations") - if self.sparsity_scheme_map: - is_ignored = False - with suppress(ValueError): - is_ignored = find_matched_target( - layer_name=layer_name, - module=layer, - 
targets=self.sparsity_ignore_list) - - # if the layer is in the sparsity ignore list, - # we should not apply any sparsity scheme - - if not is_ignored: - matched_target = find_matched_target( - layer_name=layer_name, - module=layer, - targets=self.sparsity_scheme_map.keys()) - sparsity_scheme = self.sparsity_scheme_map.get(matched_target) + # Find the sparsity scheme of the layer + # assume that fused layers inerhit first component's sparsity scheme + sparsity_targets = (self.sparsity_scheme_map.keys() - + set(self.sparsity_ignore_list)) + sparsity_scheme: Optional[SparsityCompressionConfig] = None + with suppress(ValueError): + matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=sparsity_targets, + fused_mapping=self.packed_modules_mapping) + sparsity_scheme = self.sparsity_scheme_map[matched_target] if self.supports_cutlass_24(weight_quant=weight_quant, input_quant=input_quant, @@ -418,10 +417,22 @@ class CompressedTensorsConfig(QuantizationConfig): return None # Have a valid sparsity scheme # Validate layer is supported by Cutlass 2:4 Kernel - scheme = CompressedTensors24(quantized=weight_quant is not None - or input_quant is not None, - weight_quant=weight_quant, - input_quant=input_quant) + model_compression_config = (None if sparsity_scheme is None + or sparsity_scheme.format == "dense" + else self.config) + + scheme = CompressedTensors24( + quantized=weight_quant is not None or input_quant is not None, + weight_quant=weight_quant, + input_quant=input_quant, + model_compression_config=model_compression_config, + ) + elif weight_quant is None: + logger.warning_once("Acceleration for non-quantized schemes is " + "not supported by Compressed Tensors. 
" + "Falling back to UnquantizedLinearMethod") + return None + else: # Find the quant_scheme scheme = self._get_scheme_from_parts( # type: ignore @@ -471,10 +482,21 @@ class CompressedTensorsConfig(QuantizationConfig): :return: True if the layer is supported by the Cutlass 2:4 Kernel False otherwise """ - is_valid_sparsity = (sparsity_scheme is not None - and sparsity_scheme.sparsity_structure - == SparsityStructure.TWO_FOUR.value - and sparsity_scheme.format == "dense") + if sparsity_scheme is None: + return False + + is_valid_sparsity_structure: bool = ( + sparsity_scheme.sparsity_structure == + SparsityStructure.TWO_FOUR.value) + + valid_compressors = { + CompressionFormat.dense.value, + CompressionFormat.sparse_24_bitmask.value + } + + is_valid_sparsity = (is_valid_sparsity_structure + and sparsity_scheme.format in valid_compressors) + if not is_valid_sparsity: return False diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index e1c45f4e42e41332af57df8e661edafc06b8b99a..db8e8a4b6c11a1fca16954ca64b9090f6a1be283 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from enum import Enum from typing import Callable, List, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 569ecaa6f5a76b4d4aae4c64470193ba4f129e5d..b26c74f2484b61da8f6e40b6b3be7daa67f91b1e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from 
.compressed_tensors_scheme import CompressedTensorsScheme from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index 21e6fe7a22616c38ab17e009796982aa6814d3ff..0fb8dfa96a19cfdcf780bae0e091b4c084e4a32c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -1,11 +1,17 @@ -from typing import Callable, List, Optional +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Callable, Dict, List, Optional, Tuple import torch +from compressed_tensors import CompressionFormat, ModelCompressor from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy, QuantizationType) +from compressed_tensors.utils import combine_shards from vllm import _custom_ops as ops +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( @@ -20,26 +26,39 @@ __all__ = ["CompressedTensors24"] class CompressedTensors24(CompressedTensorsScheme): - def __init__(self, - quantized: bool = False, - weight_quant: Optional[QuantizationArgs] = None, - input_quant: Optional[QuantizationArgs] = None): - + def __init__( + self, + quantized: bool = False, + weight_quant: Optional[QuantizationArgs] = None, + input_quant: Optional[QuantizationArgs] = None, + model_compression_config: Optional[Dict[str, Any]] = None, + ): self.quantized = quantized self.weight_quant = weight_quant self.input_quant = input_quant + self.model_compressor = ( + 
ModelCompressor.from_compression_config(model_compression_config) + if model_compression_config is not None else None) + self.do_sparse_decompress = ( + self.model_compressor is not None + and self.model_compressor.sparsity_config.format + == CompressionFormat.sparse_24_bitmask.value) @classmethod def get_min_capability(cls) -> int: # Only cutlass 3.x kernels are implemented so far return 90 - def create_weights(self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: List[int], - input_size_per_partition: int, - params_dtype: torch.dtype, weight_loader: Callable, - **kwargs): - + def create_weights( + self, + layer: torch.nn.Module, + input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): if not sparse_cutlass_supported(): raise ValueError( "Sparse CUTLASS not supported. vLLM must be built with " @@ -47,16 +66,56 @@ class CompressedTensors24(CompressedTensorsScheme): self.output_dtype = params_dtype layer.logical_widths = output_partition_sizes + layer.input_size = input_size + layer.input_size_per_partition = input_size_per_partition self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype) # parameter to store uncompressed weight - weight = ModelWeightParameter(data=torch.empty( - sum(output_partition_sizes), - input_size_per_partition, - dtype=self.weights_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=self.weights_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + if self.do_sparse_decompress: + assert all(partition_size % 8 == 0 + for partition_size in output_partition_sizes + ), "All partitions must be divisible by 8 for " + "2:4 sparse compressed models" + + shape = BasevLLMParameter( + data=torch.empty(2, 1, dtype=torch.int64), + 
weight_loader=weight_loader, + ) + compressed_weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 2, + dtype=self.weights_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + bitmask = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 8, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("shape", shape) + layer.register_parameter("compressed", compressed_weight) + layer.register_parameter("bitmask", bitmask) # Check if quantized, not just 2:4 Sparse if self.quantized: @@ -66,14 +125,16 @@ class CompressedTensors24(CompressedTensorsScheme): data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), output_dim=0, - weight_loader=weight_loader) + weight_loader=weight_loader, + ) else: assert (self.weight_quant and self.weight_quant.strategy == QuantizationStrategy.TENSOR.value) weight_scale = PerTensorScaleParameter( data=torch.empty(len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) + weight_loader=weight_loader, + ) layer.register_parameter("weight_scale", weight_scale) @@ -82,9 +143,10 @@ class CompressedTensors24(CompressedTensorsScheme): # register input quant scale assert (self.input_quant.strategy == QuantizationStrategy.TENSOR.value) - input_scale = BasevLLMParameter(data=torch.empty( - 1, dtype=torch.float32), - weight_loader=weight_loader) + input_scale = BasevLLMParameter( + data=torch.empty(1, dtype=torch.float32), + weight_loader=weight_loader, + ) layer.register_parameter("input_scale", input_scale) @@ -105,13 +167,25 @@ class CompressedTensors24(CompressedTensorsScheme): """ Compress weights after loading. 
Store compressed weight and meta tensor - + :post-condition: layer.w_compressed and layer.meta are set to the compressed weight and meta tensor in the format expected by the Cutlass kernels :param layer: The layer with the weights to be processed - + """ + if self.do_sparse_decompress: + layer.weight.data = self._decompress_bitmask_compressed_weight( + compressed=layer.compressed, + bitmask=layer.bitmask, + layer=layer, + ) + + # compressed and bitmask tensors + # are no longer needed after decompression + del layer.compressed + del layer.bitmask + # torch.compile workaround if hasattr(layer, "input_scale"): layer.input_scale = torch.nn.Parameter(layer.input_scale.data, @@ -119,10 +193,13 @@ class CompressedTensors24(CompressedTensorsScheme): if self.weight_quant: if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value: - layer.weight_scale = torch.nn.Parameter(convert_to_channelwise( - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths), - requires_grad=False) + layer.weight_scale = torch.nn.Parameter( + convert_to_channelwise( + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ), + requires_grad=False, + ) else: # torch.compile workaround layer.weight_scale = torch.nn.Parameter( @@ -132,20 +209,22 @@ class CompressedTensors24(CompressedTensorsScheme): layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False) layer.meta = torch.nn.Parameter(meta, requires_grad=False) - def apply_weights(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ - Returns the output tensor for the layer with 2:4 + Returns the output tensor for the layer with 2:4 sparse compressed weights, given the input tensor and bias - :param layer: The layer with 2:4 sparse compressed + :param layer: The layer with 2:4 sparse compressed weights 
to be used for the computation :param x: The input tensor to the layer :param bias: The bias to be added to the output tensor - :return: The output tensor of the layer + :return: The output tensor of the layer """ if self.quantized: scale = None @@ -169,13 +248,15 @@ class CompressedTensors24(CompressedTensorsScheme): input_scale = layer.input_scale q_input = x - out = ops.cutlass_scaled_sparse_mm(a=q_input, - bt_nzs=layer.weight, - bt_meta=layer.meta, - scale_a=input_scale, - scale_b=layer.weight_scale, - out_dtype=self.output_dtype, - bias=bias) + out = ops.cutlass_scaled_sparse_mm( + a=q_input, + bt_nzs=layer.weight, + bt_meta=layer.meta, + scale_a=input_scale, + scale_b=layer.weight_scale, + out_dtype=self.output_dtype, + bias=bias, + ) assert out.is_contiguous() return out @@ -201,8 +282,71 @@ class CompressedTensors24(CompressedTensorsScheme): raise ValueError("Quantization type not supported by Cutlass") + def _decompress_bitmask_compressed_weight( + self, + compressed: torch.Tensor, + bitmask: torch.Tensor, + layer: torch.nn.Module, + ) -> torch.Tensor: + """ + Decompress a compressed 2:4 sparse weight tensor using the bitmask and + return the result. + + This function also supports sharded decompression. + + :param compressed: The 2:4 sparse weight tensor compressed using the + sparse-24-bitmask compressor. This is different from + `cutlass_sparse_compress` which uses a different scheme (2 bits for + every nonzero element that represent the coordinate within the block + of 4). The bitmask compression here uses a bitmask to indicate the + positions of non-zero elements. + :param bitmask: The 2:4 bitmask associated with the compressed weights, + representing the positions of non-zero elements in the compressed + tensor. + :param layer: The layer whose weights need to be processed after + loading. + :return: The decompressed 2:4 sparse weight tensor. 
+ """ -def check_24(tensor): - new_tensor = tensor.view(-1, 4) - zero_counts = (new_tensor == 0).sum(dim=1) - return (zero_counts >= 2).all().item() + sparsity_compressor = self.model_compressor.sparsity_compressor + + def _process_split( + bitmask_compressed_weight: torch.Tensor, + shape, + bitmask: torch.Tensor, + ) -> torch.Tensor: + weight_data = dict( + compressed=bitmask_compressed_weight, + shape=shape, + bitmask=bitmask, + ) + return sparsity_compressor.decompress_weight(weight_data) + + split_weights: List[torch.Tensor] = [] + split_bitmask: List[torch.Tensor] = [] + split_shape: List[Tuple[int, int]] = [] + + if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)): + split_weights = torch.split(compressed, layer.logical_widths) + split_bitmask = torch.split(bitmask, layer.logical_widths) + split_shape = [(out, layer.input_size_per_partition) + for out in layer.logical_widths] + + if split_weights: + decompressed_shards = [ + _process_split(compressed_weight, shape, bitmask) + for compressed_weight, shape, bitmask in zip( + split_weights, split_shape, split_bitmask) + ] + decompressed = combine_shards(decompressed_shards) + else: + decompressed = sparsity_compressor.decompress_weight( + dict( + compressed=compressed, + shape=( + layer.logical_widths[0], + layer.input_size_per_partition, + ), + bitmask=bitmask, + )) + return decompressed diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py index b4bab33e1fb1d06eca6e22295b62e4980e11ce38..daa25d23a3060a5daa527d8c4b8c669043afcdf5 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing 
import Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 2e1b5e3c2d3b14d6973c3262005618b3b2a0bd5d..535ea6b32cfbf6784fd55aa3cebc263952a2e869 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index 1671a23d77c6314d31496fe202b1da08ba611128..5c8261908735f727084e906ce6039362ae173237 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 1d4e4bd52adaa2ceeb3769112438dede9cd3fe0a..5dcc41a9e5dab142b3f0ed703ec40cd5ce5fc008 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 76f81da98c15c69616f797f38765a4865d516ced..b642a11b3a9bd0840c245c7e22214af1affdb9da 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional, Set import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 2dd243b9c31096ee5a3801e5e7febd8f88c802e2..38df09ff39373fc33e1fe799c5feca2ecc2b4c25 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional, Set import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index f4c1dbc0361c6db481309c614f666d5702ca0f34..b69c5e7a02a7233c11c25c7c21d3a0727c0c9cba 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Type import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 
34996b08e9c91f7df04e1ec58b789891b121c99c..85ae1d5cb7878fb29a04e2db79d347df45312ae8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,12 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + import re -from typing import Iterable, Optional +from types import MappingProxyType +from typing import Iterable, List, Mapping, Optional from compressed_tensors import CompressionFormat from torch.nn import Module -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) - def is_activation_quantization_format(format: str) -> bool: _ACTIVATION_QUANTIZATION_FORMATS = [ @@ -17,8 +17,11 @@ def is_activation_quantization_format(format: str) -> bool: return format in _ACTIVATION_QUANTIZATION_FORMATS -def should_ignore_layer(layer_name: Optional[str], - ignore: Iterable[str]) -> bool: +def should_ignore_layer( + layer_name: Optional[str], + ignore: Iterable[str] = tuple(), + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: if layer_name is None: return False @@ -30,8 +33,8 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. - if proj_name in FUSED_LAYER_NAME_MAPPING and layer_name not in ignore: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in fused_mapping and layer_name not in ignore: + shard_proj_names = fused_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ @@ -77,55 +80,12 @@ def check_equal_or_regex_match(layer_name: str, return False -def _handle_fused_layers(func): - """ - Decorator to handle fused layers by mapping vllm fused layer names - to their corresponding unfused layer names for quantization/pruning schemes. 
- """ - # fused_layer_name -> unfused_layer_name - fused_layer_map = { - "qkv_proj": "q_proj", - "gate_up_proj": "up_proj", - } - - def fused_layer_handler(layer_name: Optional[str], module: Module, - targets: Iterable[str]) -> Optional[str]: - """ - Wrapper function specifically designed to support the - find_matched_target function. - - It handles cases where the provided layer name corresponds to a - fused layer in vllm, mapping it to its equivalent unfused layer name - based on the predefined fused_layer_map. If the original layer name - raises a ValueError in the wrapped function, this handler - will attempt to resolve the issue by substituting with unfused - layer name. - - :param layer_name: Name of the layer, which may be fused. - :param module: An instance of torch.nn.Module. - :param targets: A list of target names or patterns to match. - :return: The result of the wrapped find_matched_target function with - the resolved layer name. - :raises ValueError: If the layer name cannot be resolved to a - valid target. - """ - try: - return func(layer_name, module, targets) - except ValueError: - if layer_name is None: - layer_name = "" - parent_name, fused_proj_name = layer_name.rsplit(".", 1) - unfused_proj_name = fused_layer_map.get(fused_proj_name, - fused_proj_name) - new_layer_name = f"{parent_name}.{unfused_proj_name}" - return func(new_layer_name, module, targets) - - return fused_layer_handler - - -@_handle_fused_layers -def find_matched_target(layer_name: Optional[str], module: Module, - targets: Iterable[str]) -> str: +def find_matched_target( + layer_name: Optional[str], + module: Module, + targets: Iterable[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> str: """ Helper function to look up which "target" in the compressed-tensors config that a layer corresponds to. 
@@ -139,19 +99,25 @@ def find_matched_target(layer_name: Optional[str], module: Module, First, we try to match the layer_name with a target Second, we try to match the module's name with a target + Third, we try to map the layer_name to a list of fused module names. + *All* component module names must match in order for a match to be + successful. A successful match returns the first component target :param layer_name: layer name :param module: torch.nn.Module :param targets: list of targets to match the layer against + :param fused_mapping: map from fused layer names to its components + :param fused_strategy: either "all" or "any". If using "all", fused + layers match if "all" of its components match """ if layer_name is None: layer_name = "" - matched_target = (_find_first_match(layer_name, targets) - or _find_first_match(module.__class__.__name__, targets, - True) - or _match_fused_layer(layer_name, targets)) + matched_target = ( + _find_first_match(layer_name, targets) + or _find_first_match(module.__class__.__name__, targets, True) + or _match_fused_layer(layer_name, targets, fused_mapping)) if matched_target is None: raise ValueError( @@ -203,11 +169,19 @@ def _is_equal_or_regex_match(value: str, return False -def _match_fused_layer(layer_name: str, - target_layers: Iterable[str]) -> Optional[str]: +def _match_fused_layer( + layer_name: str, target_layers: Iterable[str], + fused_mapping: Mapping[str, List[str]]) -> Optional[str]: """ Match a fused layer name to its corresponding individual layer in - target_layers. + target_layers. 
Returns first value in fused_mapping which matches targets + + Implements an "all" matching strategy where a fused layer matches iff + "all" of its components match + + :param layer_name: layer name + :param target_layers: list of targets to match the layer against + :param fused_mapping: map from fused layer names to its components Examples: layer_name = "model.layers.0.self_attn.qkv_proj" @@ -215,27 +189,25 @@ def _match_fused_layer(layer_name: str, "model.layers.0.self_attn.k_proj", "model.layers.0.self_attn.v_proj"] """ - # Split into parent path and layer type - # e.g., "model.layers.0.self_attn" and "qkv_proj" - parent_path = ".".join(layer_name.split(".")[:-1]) - layer_type = layer_name.split(".")[-1] - - if layer_type not in FUSED_LAYER_NAME_MAPPING: + # find layer_name in mapping + fused = next((key for key in fused_mapping if layer_name.endswith(key)), + None) + if fused is None: return None - possible_layer_types = FUSED_LAYER_NAME_MAPPING[layer_type] - - # Look for a target layer that: - # 1. Has the same parent path - # 2. 
Ends with one of the possible individual layer types - for target in target_layers: - is_same_parent = parent_path in target - is_matching_type = any(type_suffix in target - for type_suffix in possible_layer_types) - - if is_same_parent and is_matching_type and all( - '.'.join([parent_path, type_suffix]) - for type_suffix in possible_layer_types): - return target + # expand path of unfused components + unfused_paths = [ + layer_name.replace(fused, unfused) for unfused in fused_mapping[fused] + ] - return None + # for each unfused component, find a match in targets + unfused_matches: List[Optional[str]] = [] + for unfused in unfused_paths: + for target in target_layers: + if _is_equal_or_regex_match(unfused, target): + unfused_matches.append(target) + break + else: + unfused_matches.append(None) + + return unfused_matches[0] if all(unfused_matches) else None diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 36598b3e2990ffb57828635c7046e092a2af9842..b4123650149f029644b5ebd838a3112d49c68f6d 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 100cbfa4c9598310968ad3a05984339e2b7c797f..87fbcf62ac1edc5768609468c9ec3f2b0ac37fcb 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 
7b71e13b50ccc17ac1cffc1ea8627410f7633164..da5ef36c51054076dc1b4ad6295c9d98fd5dace5 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index adab1973b40ee5260c0d954ec17f6d3a967fba84..86e025310f4efc3af4dd07f9db484a99264f0ea2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index f0943efa0039dbfcd79f907a01ce6975fc3fe9c9..86e6dbb5a5fbe2e4041478cd7c772200f69ea7c5 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import gguf diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index abafad0f1047e54901c84f871a0127ebac12d79d..0cb77a7546d1acad36f3a807a13d653354f643ea 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from enum import Enum from fractions import Fraction diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4dc4b052b0410d1d735ebc46b908a0b15487187f..99ab299958b4df3d0e555922996aa3925909f43b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional, Set, Union import torch diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index 07552c0f1334880f83e8d41f22bfac782d4ce7a7..cec984483fd8cab66892ceaa88a7fa0b49beca90 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py index 28538d299335547fb7470512eb8636f9712c0bad..432f43688ff58f608f670abd6ce9e347fbcf147e 100644 --- a/vllm/model_executor/layers/quantization/hqq_marlin.py +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index c16a962134d069939ca79308e2b33bdc544fd76b..2531170ececf98a663e0e528ef221326e5a1f938 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index 915bdc4778929af4e064909735472699386cb03e..c06befaf3b5ad877a9949741916a829138ed90ef 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: 
Apache-2.0 + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Callable, Optional, Tuple diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index 83549870e3f0b796be1eaac322a97a084df91deb..bcfdb1677716656d1de72159bddeba967cee5aed 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Type import vllm.envs as envs diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py index 1d85d62ec83ee76859dd986a3b010f9d671790e4..2706fbb539ab4e7d6c54526b5f51f930b57841a5 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py index 15df0200f30b5de824d37f296337a1b1bbd33e15..3f0586f6e30d6a02f95a36196b88f96d16a1a15e 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import partial from typing import Optional, Tuple diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py index 
6969583d6d4733e8a9ddee7793ffdaf5f2a38b00..e21801cf6a7857ae700c30f2f0d15484993e2044 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index c4a83b4faafe698c6665227354ffb7c403e1dfb8..91e7654053f9d1ad1b8a5539f41a4d30a36a891e 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Optional, Tuple diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 4824a11804163a2edb4ca29fbdd3a4a104b398e1..a5967995ac88d8e18d69adff5910fe671492a1a2 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Dict, List, Optional, Type diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 2e83a04286a0d1f85dc88c8d1d752d9d1d16738d..2bf21a05c46d9ef614ec8b25eeed75038e579fb5 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import torch diff --git 
a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index 97ec8cb0500d7002ee40283f2fd9de1124866f64..5da5df8efaeb0e55e48871fa105ae437968a4d8d 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py index 9de668e65882638439593a400657668b038c7a33..0bf090d7fab3ca13d7d5109e7461633a0b736629 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import warnings from typing import Optional, Tuple diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index e1870c73cc932fdff78e3c1649699a2f21d294d1..388a4f16699c53abed388d28500d2e054a9cf753 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import torch from vllm.logger import init_logger diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 20212e672eab0b1331a94564526f802cdd581555..4cf0c677c0794ec37303a84500ee5df4d5aace92 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 
a1b3eeb43cbee284bdf6c7bc74be90c35f0ee077..348e9bccd9b0aaf83d598334eb1d2a131b77ed71 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 8cd9c0a7ef253e564ebf4fb253d754305641d1fe..56fa597e201316ee306b7dabd834ede5d07dfdd3 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, List, Optional import torch @@ -5,17 +7,15 @@ import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) -from vllm.model_executor.layers.linear import UnquantizedLinearMethod -from vllm.model_executor.layers.quantization.awq import (AWQConfig, - AWQLinearMethod) -from vllm.model_executor.layers.quantization.awq_marlin import ( - AWQMarlinConfig, AWQMarlinLinearMethod) +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.awq import AWQConfig +from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) -from vllm.model_executor.layers.quantization.gptq import (GPTQConfig, - GPTQLinearMethod) +from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinConfig, GPTQMarlinLinearMethod) + GPTQMarlinConfig) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform 
@@ -126,25 +126,26 @@ class MoeWNA16Config(QuantizationConfig): prefix: str) -> Optional["QuantizeMethodBase"]: if is_layer_skipped_quant(prefix, self.modules_to_not_convert): return UnquantizedLinearMethod() - elif isinstance(layer, FusedMoE): - return MoeWNA16Method(self) - else: + elif isinstance(layer, LinearBase): if self.linear_quant_method == "gptq": if self.use_marlin: - return GPTQMarlinLinearMethod( - GPTQMarlinConfig.from_config(self.full_config)) + return GPTQMarlinConfig.from_config( + self.full_config).get_quant_method(layer, prefix) else: - return GPTQLinearMethod( - GPTQConfig.from_config(self.full_config)) + return GPTQConfig.from_config( + self.full_config).get_quant_method(layer, prefix) elif self.linear_quant_method == "awq": if self.use_marlin: - return AWQMarlinLinearMethod( - AWQMarlinConfig.from_config(self.full_config)) + return AWQMarlinConfig.from_config( + self.full_config).get_quant_method(layer, prefix) else: - return AWQLinearMethod( - AWQConfig.from_config(self.full_config)) + return AWQConfig.from_config( + self.full_config).get_quant_method(layer, prefix) else: raise ValueError("moe_wna16 only support gptq and awq.") + elif isinstance(layer, FusedMoE): + return MoeWNA16Method(self) + return None def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]): diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 2d5cdfa165775c843dc4f95608f7cc37d23b7999..a8e8be207fd15f76a040bca09f3fdb93c5b4295c 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from importlib.util import find_spec from typing import Any, Dict, List, Optional diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 
2ccd08202961014ab42d42154c4988ba713cef41..6e9d3dc6cb378096cdc3fb186ee40dface4e6124 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index fc214255eca71d8a623c128e087b9d34faf84189..ba123565a0ecc70396946da3ec4267081e11ea1e 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import fnmatch import re from typing import Any, Dict, List, Optional, cast @@ -16,8 +18,6 @@ from vllm.model_executor.layers.quantization.quark.schemes import ( QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8) from vllm.model_executor.layers.quantization.quark.utils import ( deep_compare, should_ignore_layer) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) from vllm.platforms import current_platform __all__ = ["QuarkLinearMethod"] @@ -56,7 +56,9 @@ class QuarkConfig(QuantizationConfig): # Check if the layer is skipped for quantization. 
exclude_layers = cast(List[str], self.quant_config.get("exclude")) - if should_ignore_layer(prefix, ignore=exclude_layers): + if should_ignore_layer(prefix, + ignore=exclude_layers, + fused_mapping=self.packed_modules_mapping): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) @@ -199,8 +201,8 @@ class QuarkConfig(QuantizationConfig): module: torch.nn.Module) -> Dict[str, Any]: proj_name = layer_name.split(".")[-1] - if proj_name in FUSED_LAYER_NAME_MAPPING: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in self.packed_modules_mapping: + shard_proj_names = self.packed_modules_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 68a395454076343aa386813f5c29f65511b83844..98743b15e4b25a5b5e6d78a8e9c31fb5bbba84c2 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Callable, Dict, Optional import torch diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py index fb0ba9bd5220c01b52d28d49a3c7cf0cf2601fbf..9069b5a0d515d78eb5d3f68b0fb162f5292db8ec 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .quark_scheme import QuarkScheme from .quark_w8a8_fp8 import QuarkW8A8Fp8 from .quark_w8a8_int8 import QuarkW8A8Int8 diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py index 
239597fa4be0e75c16704619ef2b1228f7d0b002..40c8ea86d3c385417f7810c774b5ebe85baf3a02 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py index 206931ea2ffc0e4a59fd3321c507886f1fda6bcb..c885e98a4d66e2c7b24579751da081c2f304b66d 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional import torch diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py index 8cb47e9c37e56a5eeca47a8fef9581a95c7c6ba2..1bf34b098938c1282e4ab597db849f03a4ee610f 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Callable, List, Optional, Set import torch diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index 742a629bdb1c5284c2107456ed74b2fd4e1a7d47..17e0df021085a9eff8334419e438dfeafe151b5e 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,8 +1,8 @@ -import re -from typing import Any, Iterable, Optional +# SPDX-License-Identifier: Apache-2.0 -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) +import re +from types 
import MappingProxyType +from typing import Any, Iterable, List, Mapping, Optional def deep_compare(dict1: Any, dict2: Any) -> bool: @@ -18,8 +18,11 @@ def deep_compare(dict1: Any, dict2: Any) -> bool: return dict1 == dict2 -def should_ignore_layer(layer_name: Optional[str], - ignore: Iterable[str]) -> bool: +def should_ignore_layer( + layer_name: Optional[str], + ignore: Iterable[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: if layer_name is None: return False @@ -31,8 +34,8 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. - if proj_name in FUSED_LAYER_NAME_MAPPING: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in fused_mapping: + shard_proj_names = fused_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py index a26c524787a0be7094ce8109354055b109ea98ac..026881f2dbaac6d5831ce65122675eaad7674188 100644 --- a/vllm/model_executor/layers/quantization/schema.py +++ b/vllm/model_executor/layers/quantization/schema.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ This file contains the Pydantic schemas for various quantization-related parameters. 
When a relevant quantization technique is specified, these diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index 605c3a38644ac268a37b3e9be8908f1d00b5a0bd..3234fecaa3b35803e468b4a2510d1bffd598589e 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py index e60f0c79ac1f7f46d11eec3c8cd903822f15e5ef..f7ee4728851408d2fe796b3910e04d270ba0faeb 100644 --- a/vllm/model_executor/layers/quantization/utils/__init__.py +++ b/vllm/model_executor/layers/quantization/utils/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .layer_utils import replace_parameter, update_tensor_inplace __all__ = ['update_tensor_inplace', 'replace_parameter'] diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity 
index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 
128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..77ba0d7477bdbcb036a43263e7aaa6b6913f8f4e --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..0a5d7bfdba4852da9ed08d1bc27cd7d521d09965 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, 
+ "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from 
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..cb91a279d423d0ca25197e0edd5e8c2f4da58720 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to 
vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..7febe3d272b4bb76500f7c6b523396129fd53680 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + } +} diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..9d7658bfc41b2c8fd4daf3fbdf62d15936d3d546 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + 
"BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json 
similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..03dba5ad15ba5f7f49100a5c78e8685e64334b2a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + 
"num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..9a5ff48b8942957dde9b862aed848390dd267948 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..386928de139ce718f28222b9c1a6555df3958491 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 256, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..9c908e80406587da4d246ce4e3a8a98a14c875b1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, 
+ "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from 
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..f78e7060e6840ff721d306db556636b0bbc8d9b3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 
0000000000000000000000000000000000000000..3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to 
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 0000000000000000000000000000000000000000..3cb7eaa07c745fd3aa2b3242780a7061bedac1de --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 
850820f66ff90a0aeb5e203beb55e10dcfee3586..9895537c219ab6875546ae91f63f695d0ec392f1 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from https://github.com/sgl-project/sglang/pull/2575 import functools import json @@ -13,7 +15,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( _normalize_quant_group_shape, scaled_dequantize) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear) + CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform logger = init_logger(__name__) @@ -36,7 +38,7 @@ def apply_w8a8_block_fp8_linear( weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, - cutlass_block_fp8_supported: bool = True, + cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, ) -> torch.Tensor: assert input_scale is None # View input as 2D matrix for fp8 methods @@ -45,6 +47,16 @@ def apply_w8a8_block_fp8_linear( shape_supported_by_cutlass = (weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0) + if current_platform.is_rocm(): + scale_a_shape = ((input_2d.shape[-1] // block_size[1], ) + + input_2d.shape[:-1])[::-1] + scale_b_shape = (weight_scale.view(-1, 1) + if weight_scale.dim() <= 1 else weight_scale.T).shape + ar, ac = scale_a_shape + br, bc = scale_b_shape + if (ac > 1 or bc > 1 or ar not in (1, input_2d.shape[0]) + or br not in (1, weight.shape[0])): + shape_supported_by_cutlass = False if cutlass_block_fp8_supported and shape_supported_by_cutlass: q_input, x_scale = per_token_group_quant_fp8(input_2d, block_size[1], @@ -73,12 +85,14 @@ def apply_w8a8_block_fp8_linear( # `apply_fp8_linear` # NOTE(lucas): this is quite messy, we should think through this more formally def 
apply_fp8_linear_generic( - input: torch.Tensor, - weight: torch.Tensor, - weight_scale: torch.Tensor, - input_group_shape: Tuple[int, int], - weight_group_shape: Tuple[int, int], - input_scale: Optional[torch.Tensor] = None, # static scale if one + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_group_shape: Tuple[int, int], + weight_group_shape: Tuple[int, int], + input_scale: Optional[torch.Tensor] = None, # static scale if one + cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED, + cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, ) -> torch.Tensor: # View input as 2D matrix for fp8 methods input = input.view(-1, input.shape[-1]) @@ -93,14 +107,18 @@ def apply_fp8_linear_generic( if is_dim_blocked(0, weight.shape, weight_group_shape[0])\ and is_dim_blocked(1, weight.shape, weight_group_shape[1]) and\ input_group_shape == (1, weight_group_shape[1]): - return apply_w8a8_block_fp8_linear(input, weight, - list(weight_group_shape), - weight_scale) + return apply_w8a8_block_fp8_linear( + input, + weight, + list(weight_group_shape), + weight_scale, + cutlass_block_fp8_supported=cutlass_block_fp8_supported) else: # Despite having linear in the it doesn't conform to # `torch.nn.functional.linear` which is defined as `input @ weight.T` # so we explicitly transpose the weight matrix here return apply_fp8_linear(input, weight.T, weight_scale.T, + cutlass_fp8_supported=cutlass_fp8_supported, use_per_token_if_dynamic=\ (input_group_shape == (1, input.shape[1]))) @@ -405,7 +423,7 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, # First look up if an optimized configuration is available in the configs # directory device_name = current_platform.get_device_name().replace(" ", "_") - json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + json_file_name = 
f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n},{block_k}].json" # noqa: E501 config_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py index edce6d19b6c49b651e0db42a76e0a1a98c355765..5acae7ca3b84f2047608a7688a3271a37d81331f 100644 --- a/vllm/model_executor/layers/quantization/utils/layer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Union import torch diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py index 18e1332050cdd4b2deea905055dbb780ddeedc1f..cb7d49ed6f1ca046d58bf6e5c333130963a713a0 100644 --- a/vllm/model_executor/layers/quantization/utils/machete_utils.py +++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import torch diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index c9366ca97d1491d55297b2841b2b7f02f0c68f9e..3beba30832441deca56e61056e6c285ca37ec76b 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple import numpy diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 245fe9238e42193f0eca15bb8395b3c32699f956..6120a8e66aef45227b37f5c6c5031800d832baa1 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ 
b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional import torch diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index 4a06c5d63d52dac103508cf8db2676cf9e73b555..fb557a31393caf90669aac94572b75858e810617 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utility functions used for tests and benchmarks""" from typing import List, Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py index 17d09055b1eacce00fb82dc74a88d5a426a2af95..3654268e27af31ecf8b28f0f98d1126db33a7d42 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utility functions used for tests and benchmarks""" import random diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py index cb58eb945836393c58c53f5c6d702d53861c33f9..176b2947ab09e73a87217a167a6cc00a32d940b3 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import numpy diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 95e785dcc407874cfb4e240c40029f8ef93733cf..c7ce3a42c81f99ac8ec5d7cc645806a88e6c0b54 100644 --- 
a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,5 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 """This file is used for /tests and /benchmarks""" -from typing import List, Optional, Tuple +from types import MappingProxyType +from typing import List, Mapping, Optional, Tuple import numpy import torch @@ -11,14 +13,6 @@ from vllm.scalar_type import ScalarType, scalar_types SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128] SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] -# Note: this is a hack. We should update each model to register the -# stacked params and get it from there instead in a future PR. -# fused_name: List[shard_name] -FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - # Normalize the group_shape to the full extent for any dims that are -1 def _normalize_quant_group_shape(x: torch.Tensor, group_shape: Tuple[int, @@ -177,14 +171,23 @@ def unpack_quantized_values_into_int32(w_q: torch.Tensor, return res.permute(inv_perm) -def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool: +def is_layer_skipped( + prefix: str, + ignored_layers: List[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: # prefix: model.layers.0.self_attn.q_proj # proj_name: q_proj proj_name = prefix.split(".")[-1] - if proj_name in FUSED_LAYER_NAME_MAPPING: + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. 
+ if proj_name in fused_mapping: shard_prefixes = [ prefix.replace(proj_name, shard_proj_name) - for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name] + for shard_proj_name in fused_mapping[proj_name] ] is_skipped = None diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 2f09e8a976b0f6693870939a328c15e88e86d32c..0e8166777fae61b93410f68f2bf635ae8bf530e9 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Union import torch @@ -42,6 +44,10 @@ def cutlass_block_fp8_supported() -> bool: return ops.cutlass_scaled_mm_supports_block_fp8(capability) +CUTLASS_FP8_SUPPORTED = cutlass_fp8_supported() +CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported() + + def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: @@ -109,7 +115,7 @@ def apply_fp8_linear( input_scale: Optional[torch.Tensor] = None, input_scale_ub: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, - cutlass_fp8_supported: bool = True, + cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED, use_per_token_if_dynamic: bool = False, ) -> torch.Tensor: # ops.scaled_fp8_quant supports both dynamic and static quant. 
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 9d6c3797c62fce36be607481fa6a3fbd9b7a3a2e..62e27b714866ad79221370fa3021988d1ee5ac0e 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import cached_property from importlib.util import find_spec from typing import Dict, Optional, Tuple diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index a67713c320b86e9411cbb9c1822d096a62e1ddb0..4c9860006c328d5d7a86d2221f5efab7973b5f89 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d071cfe888f05a700bc8ffe348a235cc4981c666..b3b9b0e876057ceaecc6929b3634d27f418964bd 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
@@ -25,6 +27,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.model_executor.custom_op import CustomOp @@ -770,8 +773,12 @@ class MRotaryEmbedding(RotaryEmbedding): dtype: torch.dtype, mrope_section: Optional[List[int]] = None, ) -> None: - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) + # In Qwen2.5-VL, the maximum index value is related to the duration of + # the input video. We enlarge max_position_embeddings to 4 times to get + # a larger the cos and sin cache. + self.cache_max_position_num = max_position_embeddings * 4 + super().__init__(head_size, rotary_dim, self.cache_max_position_num, + base, is_neox_style, dtype) self.mrope_section = mrope_section if self.mrope_section: @@ -829,13 +836,10 @@ class MRotaryEmbedding(RotaryEmbedding): @staticmethod def get_input_positions( input_tokens: List[int], + hf_config: PretrainedConfig, image_grid_thw: Union[List[List[int]], torch.Tensor], video_grid_thw: Union[List[List[int]], torch.Tensor], - image_token_id: int, - video_token_id: int, - vision_start_token_id: int, - vision_end_token_id: int, - spatial_merge_size: int, + second_per_grid_ts: Optional[List[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, ) -> Tuple[List[List[int]], int]: @@ -843,16 +847,13 @@ class MRotaryEmbedding(RotaryEmbedding): llm_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions_tensor( - input_tokens, - image_grid_thw, - video_grid_thw, - image_token_id, - video_token_id, - vision_start_token_id, - vision_end_token_id, - spatial_merge_size, - context_len, - seq_len, + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=context_len, + seq_len=seq_len, ) return llm_positions.tolist(), mrope_position_delta @@ -860,18 +861,22 @@ class 
MRotaryEmbedding(RotaryEmbedding): @staticmethod def get_input_positions_tensor( input_tokens: List[int], + hf_config: PretrainedConfig, image_grid_thw: Union[List[List[int]], torch.Tensor], video_grid_thw: Union[List[List[int]], torch.Tensor], - image_token_id: int, - video_token_id: int, - vision_start_token_id: int, - vision_end_token_id: int, - spatial_merge_size: int, + second_per_grid_ts: Optional[List[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, ) -> Tuple[torch.Tensor, int]: """Get mrope input positions and delta value.""" + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + tokens_per_second = getattr(hf_config.vision_config, + "tokens_per_second", 1.0) + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() if isinstance(video_grid_thw, torch.Tensor): @@ -890,6 +895,7 @@ class MRotaryEmbedding(RotaryEmbedding): image_index, video_index = 0, 0 for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 if image_token_id in input_tokens and remain_images > 0: ed_image = input_tokens.index(image_token_id, st) else: @@ -913,9 +919,13 @@ class MRotaryEmbedding(RotaryEmbedding): video_grid_thw[video_index][1], video_grid_thw[video_index][2], ) + video_second_per_grid_t = 1.0 + if second_per_grid_ts is not None: + video_second_per_grid_t = second_per_grid_ts[video_index] video_index += 1 remain_videos -= 1 ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = \ t, h // spatial_merge_size, w // spatial_merge_size text_len = ed - st @@ -925,8 +935,10 @@ class MRotaryEmbedding(RotaryEmbedding): llm_pos_ids_list.append( torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - t_index = torch.arange(llm_grid_t).view(-1, 1).expand( - -1, llm_grid_h * llm_grid_w).flatten() + t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( + -1, 
llm_grid_h * llm_grid_w) * video_second_per_grid_t * + tokens_per_second).long().flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( llm_grid_t, -1, llm_grid_w).flatten() w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index e7593a6de22eb27461892ebde5e0809eebbad227..00f1b3b92a21d6fa9adc1456afc720dda4d70af2 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A layer that samples the next tokens from the model's outputs.""" import itertools import warnings diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 6aa4b8bd34cdebf3d31dfd1394bcca9f0960ddc3..35c7ffec271e509523e4c49b56afb74daf93958f 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from typing import Dict, Optional, Union diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py index 3206fd36a0001d426bb414913b0325ea8d2bde40..054d8c471272efb95642eb60dd232040eb8c3c03 100644 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Optional, List import torch diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index f6f34cd49d95301e403daff626c524f101947061..dfe71028c1bc7d7a534d020ab41fd1ceb3b70208 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utility methods for model layers.""" from typing import 
Tuple diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 2c2b6dfb81e0098adfa207dceb9473f3a023c27f..ee45eb178018ad277e496f62c445ddfbc57379c7 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import List, Optional, Sequence, Tuple diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index 12468997e46533e80d5c14238b0de7b9bc4bf067..9048c70c7a71435bfd8426ab8628d04acf5f3b40 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from torch import nn from vllm.config import VllmConfig diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 16f9e456dfa40b766256a918e3f676659b8c0a5f..2978cb97b9b7c28d83d2c8c657fc079bf8c58863 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa: SIM117 import collections import copy @@ -41,6 +43,7 @@ from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, serialize_vllm_model, tensorizer_weights_iterator) from vllm.model_executor.model_loader.utils import (ParamMapping, + configure_quant_config, get_model_architecture, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( @@ -111,6 +114,9 @@ def _initialize_model( model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) + if vllm_config.quant_config is not None: + configure_quant_config(vllm_config.quant_config, model_class) + signatures = inspect.signature(model_class.__init__) 
all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: @@ -801,9 +807,11 @@ class BitsAndBytesModelLoader(BaseModelLoader): iterator = safetensors_weights_iterator(hf_weights_files) else: iterator = pt_weights_iterator(hf_weights_files) - for name, param in iterator: - # mapping weight names from transformers to vllm. - yield self.weight_mapper(name), param + for org_name, param in iterator: + # mapping weight names from transformers to vllm while preserving + # original names. + mapped_name = self.weight_mapper(org_name) + yield org_name, mapped_name, param def _get_quantized_weights_iterator( self, @@ -864,24 +872,30 @@ class BitsAndBytesModelLoader(BaseModelLoader): def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, quant_state_dict) -> Generator: - for weight_name, weight_tensor in self._hf_weight_iter( - hf_weights_files, use_safetensors): - if not weight_name.lower().endswith(".scb"): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + if not mapped_weight_name.lower().endswith(".scb"): continue - weight_key = weight_name.lower().replace(".scb", ".weight") + weight_key = mapped_weight_name.lower().replace(".scb", ".weight") quant_state_dict[weight_key] = weight_tensor - for weight_name, weight_tensor in self._hf_weight_iter( - hf_weights_files, use_safetensors): - if self._is_8bit_weight_name(weight_name): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + if self._is_8bit_weight_name(mapped_weight_name): continue - if weight_name in quant_state_dict: + if mapped_weight_name in quant_state_dict: set_weight_attrs(weight_tensor, {"load_in_8bit": True}) - yield weight_name, weight_tensor + yield org_weight_name, weight_tensor else: - yield weight_name, weight_tensor + yield org_weight_name, weight_tensor def 
_quantized_4bit_generator(self, hf_weights_files, use_safetensors, quant_state_dict) -> Generator: @@ -891,15 +905,19 @@ class BitsAndBytesModelLoader(BaseModelLoader): weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors) temp_state_dict = {} - for weight_name, weight_tensor in weight_iterator: - if not self._is_4bit_weight_name(weight_name): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in weight_iterator: + if not self._is_4bit_weight_name(mapped_weight_name): continue # bitsandbytes library requires # weight.quant_state.bitsandbytes__* in CPU - if "quant_state.bitsandbytes" in weight_name: - temp_state_dict[weight_name] = weight_tensor.cpu().data + if "quant_state.bitsandbytes" in mapped_weight_name: + temp_state_dict[mapped_weight_name] = weight_tensor.cpu().data else: - temp_state_dict[weight_name] = weight_tensor + temp_state_dict[mapped_weight_name] = weight_tensor # Closure to parse quant_state for each prequant weight def _parse_quant_state(param_name: str, @@ -913,20 +931,24 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Second iterate over all prequant and normal weights # pre quantized weights would have a quant_state - for weight_name, weight_tensor in self._hf_weight_iter( - hf_weights_files, use_safetensors): - if self._is_4bit_weight_name(weight_name): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + if self._is_4bit_weight_name(mapped_weight_name): continue - if (f"{weight_name}.quant_state.bitsandbytes__nf4" + if (f"{mapped_weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or ( - f"{weight_name}.quant_state.bitsandbytes__fp4" + f"{mapped_weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict): - quant_state = _parse_quant_state(weight_name, temp_state_dict) - quant_state_dict[weight_name] = quant_state - yield weight_name, weight_tensor + quant_state = _parse_quant_state(mapped_weight_name, + 
temp_state_dict) + quant_state_dict[mapped_weight_name] = quant_state + yield org_weight_name, weight_tensor else: - yield weight_name, weight_tensor + yield org_weight_name, weight_tensor def _unquantized_generator(self, hf_weights_files, use_safetensors, quant_state_dict) -> Generator: @@ -935,18 +957,22 @@ class BitsAndBytesModelLoader(BaseModelLoader): tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() - for weight_name, weight_tensor in self._hf_weight_iter( - hf_weights_files, use_safetensors): - if any(target_module in weight_name for target_module in - self.target_modules) and weight_name.endswith(".weight"): + for ( + org_weight_name, + mapped_weight_name, + weight_tensor, + ) in self._hf_weight_iter(hf_weights_files, use_safetensors): + if any(target_module in mapped_weight_name + for target_module in self.target_modules + ) and mapped_weight_name.endswith(".weight"): # Without sharding if any( - weight_name.startswith(module) + mapped_weight_name.startswith(module) for module in self.unsharded_weights_modules): weight_sub_tensor = weight_tensor # Shard by column elif any( - weight_name.startswith(module) + mapped_weight_name.startswith(module) for module in self.column_sharded_weights_modules): total_size = weight_tensor.size(-1) start_index = total_size // tp_size * tp_rank @@ -956,14 +982,14 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Weights have fused on disk. In this case, we assume that the # weight and module use same name. 
elif any( - weight_name.startswith(module) + mapped_weight_name.startswith(module) for module in self.maybe_fused_weights_modules): # special case for fused weights # get the size of each shard weight tensor total_shard_sizes = next( (sizes for module, sizes in self.maybe_fused_weights_modules.items() - if weight_name.startswith(module))) + if mapped_weight_name.startswith(module))) total_size = weight_tensor.size(0) assert total_size == sum(total_shard_sizes) # get the start/end index of each shard weight tensor @@ -1006,23 +1032,21 @@ class BitsAndBytesModelLoader(BaseModelLoader): quant_type="nf4", ) - quant_state_dict[weight_name] = quant_state + quant_state_dict[mapped_weight_name] = quant_state else: processed_weight = weight_tensor - - yield weight_name, processed_weight + yield org_weight_name, processed_weight def _get_bnb_target_modules(self, model: nn.Module) -> None: for name, module in model.named_modules(): if isinstance(module, (LinearBase, )): - last_name = name.split(".")[-1] - if sub_modules := self.modules_mapping.packed_mapping.get( - last_name, []): + if modules_info := self.modules_mapping.get_sub_modules(name): # Map vllm's names to transformers's names. + rep_name, sub_modules = modules_info for sub_name in sub_modules: self.target_modules.append( - name.replace(last_name, sub_name)) + name.replace(rep_name, sub_name)) # Add original module name even if the module has stacked map, # in case model has a mixture of disk-merged and disk-splitted # weights with same last name. 
diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index a90fbd648def9d41f79910f648707b1b194a4f0b..d900fb3a7d3973a18b43b481fb6f199d4bc3c6e3 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utilities for selecting and loading neuron models.""" import copy import importlib diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index e6299295c85a241f07e9100e5224c8b489fb2024..fde200d576e2fda2c61b20e0305fb2616d88fc38 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa: SIM117 from pathlib import Path from typing import List, Optional, Tuple @@ -11,7 +13,7 @@ from torch import nn import vllm.envs as envs from vllm.attention.backends.openvino import OpenVINOAttentionMetadata -from vllm.config import DeviceConfig, ModelConfig +from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import (LogitsProcessor, _prune_hidden_states) @@ -101,7 +103,6 @@ class OpenVINOCausalLM(nn.Module): self, ov_core: ov.Core, model_config: ModelConfig, - device_config: DeviceConfig, kv_cache_dtype: ov.Type, ) -> None: super().__init__() @@ -185,8 +186,7 @@ class OpenVINOCausalLM(nn.Module): def get_model( - model_config: ModelConfig, - device_config: DeviceConfig, + vllm_config: VllmConfig, kv_cache_dtype: ov.Type, **kwargs, ) -> torch.nn.Module: @@ -199,5 +199,6 @@ def get_model( "be added in the future. 
If this is important to you, " "please open an issue on github.") - return OpenVINOCausalLM(ov_core, model_config, device_config, - kv_cache_dtype) + with set_current_vllm_config(vllm_config): + return OpenVINOCausalLM(ov_core, vllm_config.model_config, + kv_cache_dtype) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 9266ca75ddaacb4533708c00312a95f1e09017df..117251ccf05f183f75719eb5d1ec604b3f76c3f9 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import dataclasses import io diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index baf16d6703f4360ecad4ebbd7947fdac86b9e65c..e0fe215a2379e6e2a44517b11f8ea175d0d23777 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -1,18 +1,26 @@ +# SPDX-License-Identifier: Apache-2.0 """Utilities for selecting and loading models.""" import contextlib from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import os import torch +import transformers from torch import nn +from transformers.dynamic_module_utils import get_class_from_dynamic_module -from vllm.config import ModelConfig +from vllm.config import ModelConfig, ModelImpl +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models.adapters import (as_classification_model, as_embedding_model, as_reward_model) +logger = init_logger(__name__) + @contextlib.contextmanager def set_default_torch_dtype(dtype: torch.dtype): @@ -23,6 +31,50 @@ def set_default_torch_dtype(dtype: torch.dtype): torch.set_default_dtype(old_dtype) +def 
is_transformers_impl_compatible( + arch: str, + module: Optional[transformers.PreTrainedModel] = None) -> bool: + mod = module or getattr(transformers, arch, None) + if mod is None: + return False + if hasattr(mod, "supports_backend"): + return mod.is_backend_compatible() + else: + return mod._supports_flex_attn + + +def resolve_transformers_fallback(model_config: ModelConfig, + architectures: list[str]): + for i, arch in enumerate(architectures): + if arch == "TransformersModel": + continue + custom_module = None + auto_map = getattr(model_config.hf_config, "auto_map", None) + if auto_map is not None and "AutoModel" in auto_map: + custom_module = get_class_from_dynamic_module( + model_config.hf_config.auto_map["AutoModel"], + model_config.model) + # TODO(Isotr0py): Further clean up these raises. + # perhaps handled them in _ModelRegistry._raise_for_unsupported? + if model_config.model_impl == ModelImpl.TRANSFORMERS: + if not is_transformers_impl_compatible(arch, custom_module): + raise ValueError( + f"The Transformers implementation of {arch} is not " + "compatible with vLLM.") + architectures[i] = "TransformersModel" + if model_config.model_impl == ModelImpl.AUTO: + if not is_transformers_impl_compatible(arch, custom_module): + raise ValueError( + f"{arch} has no vLLM implementation and the Transformers " + "implementation is not compatible with vLLM.") + logger.warning( + "%s has no vLLM implementation, falling back to Transformers " + "implementation. 
Some features may not be supported and " + "performance may not be optimal.", arch) + architectures[i] = "TransformersModel" + return architectures + + def get_model_architecture( model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) @@ -72,6 +124,14 @@ def get_model_architecture( and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] + vllm_supported_archs = ModelRegistry.get_supported_archs() + is_vllm_supported = any(arch in vllm_supported_archs + for arch in architectures) + if (not is_vllm_supported + or model_config.model_impl == ModelImpl.TRANSFORMERS): + architectures = resolve_transformers_fallback(model_config, + architectures) + model_cls, arch = ModelRegistry.resolve_model_cls(architectures) if model_config.task == "embed": model_cls = as_embedding_model(model_cls) @@ -108,3 +168,30 @@ class ParamMapping: packed_name, index, ) + + def get_sub_modules(self, + module_name: str) -> Optional[Tuple[str, List[str]]]: + for key, value in self.packed_mapping.items(): + if module_name.endswith(key): + return key, value + return None + + +def configure_quant_config(quant_config: QuantizationConfig, + model_class: Type[nn.Module]): + """ + Pass packed_modules_mapping by reference to quant_config so that + quant_config can properly match fused modules + + Note that model attributes are passed by reference to quant_config, + enabling them to be updated by model_class.__new__ (ex. 
chatglm, qwen) + """ + packed_mapping = getattr(model_class, "packed_modules_mapping", None) + if packed_mapping is not None: + # pass packed_modules_mapping by reference to quant_config + quant_config.packed_modules_mapping = packed_mapping + else: + logger.warning( + "The model class %s has not defined `packed_modules_mapping`, " + "this may lead to incorrect mapping of quantized or ignored " + "modules", model_class.__name__) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index e4d103f7cab99441446fb0b4bac8679c819d9ac0..cade0a1dd595020a522361eba9eb58c3e43cdc56 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utilities for downloading and initializing model weights.""" import fnmatch import glob diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index afd4171f9cf4b54f475c368c352ff5ddf858bf5e..85bc99b87d6a84128260438da3f5c9de263fa65e 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, SupportsPP, has_inner_state, supports_lora, supports_multimodal, supports_pp) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 55e90b9d419509785eb80c75fe95da19c9b68a8b..3e1daa773fc83d032536056386a774ac0f0277b7 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Optional, TypeVar diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index fd6b5659df5d166df404039b41dcda84da555f68..d015682aab47fb75965b717d83433944c8dc3bd9 
100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Inference-only Snowflake Arctic model.""" from typing import Iterable, List, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 8c6873de13627969513382773d6e28a6be6fa9b8..98df532aa0a8325bb38d6603d363e70474f0dfc3 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -397,7 +399,11 @@ class AriaProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 6f50345b08673d80782d18925ba0497d5e71f5af..8cdbab2474a879283e4eeec8f12f8b56c7b238b9 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 57eb5adc82d5b3457d10542d617cadb3a238e91c..204c48d0d8969d70468e1418232d7d4f2fbf45d2 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Derived from BART implementation posted on HuggingFace; license below: # # coding=utf-8 diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 4be136543de15784223c03a4cdcd84d3e6fa46ef..4d0f5ac8ea5df5b215a490f424ef1652d4c26291 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Set, Tuple import torch diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 987dfaf44f22898039fe9b018c3feb39f605046c..bedbdceb7721d515da125c8a3db6aa82ae619fb4 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" from typing import Iterable, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index b559ac677a740e68643becf8b8390062950d7344..0463a0b97d40aa7242d0675f6c3f1c52b1244192 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -405,7 +407,11 @@ class Blip2ProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def 
get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 13ffebe9e736674c130ab71b20d23599ea33e4c9..ee459b80ec6e8bc7c378da30ae0bddddd699e6e0 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index e834c9004f140370e0d6cf1172379922b29e0e6a..b29dd65a8e3576ac2ed3539c7178d046b449967f 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import cached_property from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -62,7 +64,11 @@ class ChameleonProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index de79a08d376cd81ef5da8188d7e0b05c94439fe9..dfa5a4fd9b49ec46f6f4d4ea76cca043b6fa53a0 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # 
Adapted from # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" @@ -273,12 +275,14 @@ class GLMAttention(nn.Module): self.total_num_kv_heads, bias=config.add_bias_linear or config.add_qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.query_key_value", ) self.dense = RowParallelLinear( self.total_num_heads * self.head_dim, config.hidden_size, bias=config.add_bias_linear, quant_config=quant_config, + prefix=f"{prefix}.dense", ) # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 @@ -342,6 +346,7 @@ class GLMMLP(nn.Module): self, config: ChatGLMConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() @@ -353,6 +358,7 @@ class GLMMLP(nn.Module): [config.ffn_hidden_size] * 2, bias=config.add_bias_linear, quant_config=quant_config, + prefix=f"{prefix}.dense_h_to_4h", ) self.activation_func = SiluAndMul() @@ -363,6 +369,7 @@ class GLMMLP(nn.Module): config.hidden_size, bias=config.add_bias_linear, quant_config=quant_config, + prefix=f"{prefix}.dense_4h_to_h", ) def forward(self, hidden_states): @@ -411,7 +418,7 @@ class GLMBlock(nn.Module): config.hidden_size, eps=config.layernorm_epsilon) # MLP - self.mlp = GLMMLP(config, quant_config) + self.mlp = GLMMLP(config, quant_config, prefix=f"{prefix}.mlp") def forward( self, @@ -522,7 +529,8 @@ class ChatGLMModel(nn.Module): self.embedding = VocabParallelEmbedding(config.padded_vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.embedding") self.num_layers = config.num_layers self.multi_query_group_num = config.multi_query_group_num @@ -834,6 +842,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsMultiModal): # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. 
+ # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -845,9 +854,18 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, prefix: str = "", ) -> None: config = vllm_config.model_config.hf_config + # Initialize VL - if hasattr(config, "vision_config"): - return ChatGLMV(vllm_config=vllm_config, prefix=prefix) + if hasattr(config, "vision_config"): # noqa: SIM108 + instance_cls = ChatGLMV # Initialize LLM else: - return ChatGLM(vllm_config=vllm_config, prefix=prefix) \ No newline at end of file + instance_cls = ChatGLM + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index b94ede8e4babb12cf23f3861cb1fcd2a0920a82c..97706fc39c8fc92f860f41521a401ae628faa16e 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" from typing import Iterable, List, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 989056bf5c155e0516ab47ddb2379929604aab21..e73627da05d40684c08af1ad0c9b91ae344995ba 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index b2aa3c0709bd49e0e50dc76628efb33dfabd1ef4..bb3f4f40dd21140eb1fa2f24aff9750f965e2cea 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Set, Tuple, Union import torch diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index c551853956b92b1ed8eb8fe955e01a14c3929226..b239b642f752b6ab8d2608636da5b5562c71d4ba 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 DeciAI Research Team. All rights reserved. diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 74b6bfdf21909c178e03a255eb71cf2942c99033..9599e1df6a3cb4a3ae982d397344b2c8125fb1e8 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 73388cd26985361f009d8e84e3b5334e24a222fb..773f5abe71daedf53eae3b81bf02a6b45ffb1788 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
@@ -19,7 +21,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Inference-only DeepseekV2 model.""" +"""Inference-only DeepseekV2/DeepseekV3 model.""" from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch @@ -113,23 +115,32 @@ class DeepseekV2MoE(nn.Module): raise ValueError(f"Unsupported activation: {config.hidden_act}. " "Only silu is supported for now.") - self.experts = FusedMoE(num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts") - self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, quant_config=None, prefix=f"{prefix}.gate") + if config.topk_method == "noaux_tc": + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.n_routed_experts)) + else: + self.gate.e_score_correction_bias = None + + self.experts = FusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.scoring_func, + e_score_correction_bias=self.gate.e_score_correction_bias) + if config.n_shared_experts is not None: intermediate_size = (config.moe_intermediate_size * config.n_shared_experts) @@ -412,7 +423,8 @@ class DeepseekV2MLAAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 
'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, @@ -729,6 +741,15 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + + # TODO(simon): support nextn predict layers + if hasattr(self.config, "num_nextn_predict_layers" + ) and self.config.num_nextn_predict_layers > 0: + assert self.config.num_nextn_predict_layers == 1 + layer_idx = self.config.num_hidden_layers + if name.startswith(f"model.layers.{layer_idx}"): + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: @@ -790,3 +811,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + +class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM): + pass diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 344832d8b33e6f390ff458f4ce1f540dbcc4613c..0eaf3a6201f6b32504d15f0d93e472c98e9dbf8d 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math @@ -163,7 +165,11 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo): image_width=x[1], image_height=x[0])) return ImageSize(width=width, height=height) - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: max_image_size = 
self.get_image_size_with_most_features() max_image_tokens = self.get_num_image_tokens( image_height=max_image_size.height, diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 948560b4906b8a45d87882805bc4be8fe445c264..373a728be89cb45e6a7c6ae0b49141cfd71b0335 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Iterable, List, Optional, Tuple import torch diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index bc3295da7b60abeb9344b71863fe8fe5259092fa..2eb91a682242c04629806719929374ae8beb818f 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py # Copyright 2024 The LG U+ CTO AI Tech Lab. diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index b93a68680375d5cff8ff5d956da1adc89fb943d1..310aca999bc2d382480f92c566eb6609f51292d5 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The vLLM team. # Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. # diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 783e90d9d098f4a0de7faa8016a4411e9f1661ee..a79e4ba1ef6efc1edecfd04670a3bcd460747b42 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 3a5fe8e1f4144abc1f322fac8fe56be82a9a45cf..4a1ad5f4ee0cee69e65a5b84c529ea10a13e4ce0 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from typing import Iterable, List, Optional, Set, Tuple diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index dbf9da50cc9dedda59a77f1a734ed924fb8ec65f..50b5ef35d2cd122c54aac7778bf1344dc8fffe3d 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py # Copyright 2023 The vLLM team. # Copyright 2023 HuggingFace Inc. team. All rights reserved. @@ -78,7 +80,11 @@ class FuyuProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: target_width, target_height = self.get_image_size_with_most_features() max_ncols, max_nrows = self.get_image_feature_grid_size( diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index b23aba829c54963db8f46f322b7be12ee5fcd23f..cb81aa41e25420f88678b8b411cd16f68b53ac9f 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The vLLM team. # Copyright (c) Google Inc. 
# diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index f0dc7693974beaa3de23de155031b68c2cd15dc5..a6dc8f84772b47a6716d907039f63a15a9186901 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The vLLM team. # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py index 942d1e14baed18bb0b018c9c3655bc32262f6d6d..5f1903345f0d72ccad2d59b73c7ae2b22300c88d 100644 --- a/vllm/model_executor/models/glm.py +++ b/vllm/model_executor/models/glm.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Inference-only HF format GLM-4 model compatible with THUDM weights.""" from vllm.config import VllmConfig from vllm.model_executor.models.llama import LlamaForCausalLM diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index f605c519c790f05bb8c777a52c144675bbfe79f4..864aba1f4703dffc2a2d52cd40998522fac7eaaf 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" @@ -72,11 +74,13 @@ class Attention(nn.Module): self.head_dim, config.num_heads, quant_config=quant_config, + prefix=f"{prefix}.query_key_value", ) self.dense = RowParallelLinear( config.hidden_size, config.hidden_size, quant_config=quant_config, + prefix=f"{prefix}.dense", ) self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, @@ -99,6 +103,7 @@ class MLP(nn.Module): self, config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): super().__init__() self.config = config @@ -107,11 +112,13 @@ class MLP(nn.Module): 
config.hidden_size, config.intermediate_size, quant_config=quant_config, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, quant_config=quant_config, + prefix=f"{prefix}.fc2", ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -135,7 +142,9 @@ class TransformerLayer(nn.Module): self.attention = Attention(config, quant_config=quant_config, prefix=f"{prefix}.attention") - self.mlp = MLP(config, quant_config=quant_config) + self.mlp = MLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.post_attention_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -162,7 +171,7 @@ class Transformer(nn.Module): self.layers = nn.ModuleList([ TransformerLayer(config, quant_config=quant_config, - prefix=f"{prefix}.layer.{layer_idx}") + prefix=f"{prefix}.layers.{layer_idx}") for layer_idx in range(config.num_hidden_layers) ]) @@ -179,6 +188,7 @@ class GLU(nn.Module): config, in_features, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): """ The original implementation is the same as: @@ -220,7 +230,8 @@ class GLU(nn.Module): self.linear_proj = ReplicatedLinear(in_features, config.hidden_size, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") self.norm1 = nn.LayerNorm(config.hidden_size) self.act1 = nn.GELU() self.act2 = SiluAndMul() @@ -228,12 +239,15 @@ class GLU(nn.Module): self.merged_proj = MergedColumnParallelLinear( config.hidden_size, [config.ffn_hidden_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.merged_proj") - self.dense_4h_to_h = RowParallelLinear(config.ffn_hidden_size, - config.hidden_size, - bias=False, - quant_config=quant_config) + self.dense_4h_to_h = RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.dense_4h_to_h") def forward(self, x): x, _ = 
self.linear_proj(x) @@ -260,7 +274,8 @@ class EVA2CLIPModel(nn.Module): prefix=f"{prefix}.transformer") self.linear_proj = GLU(config, in_features=config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, out_channels=config.hidden_size, kernel_size=2, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 2f1aa2d68653c6783eb8d4a2952b34cc194e4558..7ad9a24dcbbcc79b53d1f52ac31cc8f21b500d2f 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c64bc706888063f43daced21dc3a2144bf2bdec1..887a444748ae2295aebe78786d0df7c5d56e8dde 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 08298cc0db36f96df2e8bbcfddf47990a514f53b..815aba145d30306fd6d61a6a6a5e20adab399dc6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 731642772011c079a77fdb93062981ac27b416db..550ca3f7ca9e24bb96712f462dbb0ba63a380c58 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 543b4e2f5e28627339f40d506415151c75e93ea6..85911a0f41c2f2c40583e60e709fd684e69eb94a 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index cdf9414d5949c04b5a1b0d748affc72e618fc7d7..8ae661bf15c49c78420cb81eead121fcd05e888b 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index d179d6235424aa78b9eefd23ab94fac67a201c80..7bda54ea7689bb23dac33bbbf5c0495f6c6efe05 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from typing import List, Optional, Union diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index df7e768fe14d3f087cf4ea9072b341c65381ae30..cf3e777a2027f5dfe01e136fffa070c7469e8eec 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py # -------------------------------------------------------- @@ -5,43 +7,55 @@ # Copyright (c) 2024 H2O.AI # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from functools import partial -from typing import List, Optional, Tuple +from typing import Mapping, Optional import torch from PIL import Image from transformers import PretrainedConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.utils import is_list_of +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalKwargs +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + MultiModalDataItems) +from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, + PromptReplacementDetails) +from 
vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.transformers_utils.tokenizer import AnyTokenizer from .intern_vit import InternVisionModel -from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, InternVLChatModel, - InternVLInputPipeline, build_transform, - find_closest_aspect_ratio, get_internvl_num_patches) +from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, + BaseInternVLProcessingInfo, BaseInternVLProcessor, + InternVLChatModel, InternVLDummyInputsBuilder, + InternVLMultiModalProcessor, build_transform, + find_closest_aspect_ratio, get_internvl_target_ratios) +logger = init_logger(__name__) -# modified to include blocks generated in second pass -def calculate_num_blocks( - orig_width: int, - orig_height: int, - min_num: int, - max_num: int, - image_size: int, + +def resolve_h2ovl_min_max_num( + *, + min_dynamic_patch: int, + max_dynamic_patch: int, + dynamic_image_size: bool, use_thumbnail: bool, - prior_aspect_ratio=None, -) -> Tuple[int, int, int, Tuple[int, int]]: - aspect_ratio = orig_width / orig_height +) -> tuple[int, int]: + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 + + if use_thumbnail and max_dynamic_patch != 1: + max_dynamic_patch += 1 + + return min_dynamic_patch, max_dynamic_patch - # calculate the existing image aspect ratio - target_ratios = set((i, j) for n in range(min_num, max_num + 1) - for i in range(1, n + 1) for j in range(1, n + 1) - if i * j <= max_num and i * j >= min_num) - target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + +def get_h2ovl_target_ratios( + min_num: int, + max_num: int, + *, + prior_aspect_ratio: Optional[tuple[int, int]], +) -> list[tuple[int, int]]: + target_ratios = get_internvl_target_ratios(min_num, max_num) # if prior_aspect_ratio is provided, filter the target ratios if prior_aspect_ratio is not None: @@ -50,44 +64,66 @@ def calculate_num_blocks( ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0 ] + return target_ratios + + +# modified to 
include blocks generated in second pass +def calculate_h2ovl_targets( + *, + orig_width: int, + orig_height: int, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> tuple[int, int, int, tuple[int, int]]: + aspect_ratio = orig_width / orig_height + # find the closest aspect ratio to the target - target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, - target_ratios, orig_width, - orig_height, image_size) + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, + target_ratios, + width=orig_width, + height=orig_height, + image_size=image_size, + ) # calculate the target width and height target_width = image_size * target_aspect_ratio[0] target_height = image_size * target_aspect_ratio[1] blocks = target_aspect_ratio[0] * target_aspect_ratio[1] - # add thumbnail image if num_blocks > 1 - if use_thumbnail and blocks > 1: + + # add thumbnail image if num_blocks != 1 + if use_thumbnail and blocks != 1: blocks += 1 + return blocks, target_width, target_height, target_aspect_ratio # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -# refactored to handle prior_aspect_ratio as optional -def dynamic_preprocess( +# refactored to handle prior_aspect_ratio +def dynamic_preprocess_h2ovl( image: Image.Image, - min_num: int, - max_num: int, + *, + target_ratios: list[tuple[int, int]], image_size: int, use_thumbnail: bool, - prior_aspect_ratio: Optional[Tuple[int, int]] = None, -) -> Tuple[List[Image.Image], Tuple[int, int]]: +) -> tuple[list[Image.Image], tuple[int, int]]: orig_width, orig_height = image.size - # calculate the number of blocks based on prior aspect ratio if available - blocks, target_width, target_height, target_aspect_ratio = ( - calculate_num_blocks( - orig_width, - orig_height, - min_num, - max_num, - image_size, - use_thumbnail=False, - prior_aspect_ratio=prior_aspect_ratio, - )) + # calculate the number of blocks without thumbnail + ( + blocks, + target_width, + target_height, + 
target_aspect_ratio, + ) = calculate_h2ovl_targets( + orig_width=orig_width, + orig_height=orig_height, + target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False, + ) + # resize the image resized_img = image.resize((target_width, target_height)) processed_images = [] @@ -101,276 +137,393 @@ def dynamic_preprocess( # split the image split_img = resized_img.crop(box) processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: thumbnail_img = image.resize((image_size, image_size)) processed_images.append(thumbnail_img) + return processed_images, target_aspect_ratio -def load_image( +def _preprocess_image( image: Image.Image, - input_size=448, - min_num=1, - max_num=6, - use_thumbnail=True, - prior_aspect_ratio: Optional[Tuple[int, int]] = None, -) -> Tuple[torch.Tensor, Tuple[int, int]]: + *, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, + prior_aspect_ratio: Optional[tuple[int, int]], +) -> tuple[torch.Tensor, tuple[int, int]]: + target_ratios = get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=prior_aspect_ratio, + ) + transform = build_transform(input_size=input_size) - images, target_aspect_ratio = dynamic_preprocess( + images, target_aspect_ratio = dynamic_preprocess_h2ovl( image, image_size=input_size, use_thumbnail=use_thumbnail, - min_num=min_num, - max_num=max_num, - prior_aspect_ratio=prior_aspect_ratio, + target_ratios=target_ratios, ) - pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) + + pixel_values = torch.stack([transform(image) for image in images]) return pixel_values, target_aspect_ratio -# refactored to use the combined load_image function -def image_to_pixel_values( +# refactored to use the _preprocess_image function +def image_to_pixel_values_h2ovl( image: Image.Image, + *, input_size: int, min_num: int, max_num: int, use_thumbnail: bool, - use_MSAC: bool, + use_msac: 
bool, ) -> torch.Tensor: # when MSAC is turned on, we need to process the image twice - if use_MSAC: + if use_msac: # first pass - pixel_values, target_aspect_ratio = load_image( + pixel_values1, aspect_ratio1 = _preprocess_image( image, input_size=input_size, min_num=min_num, max_num=max_num, use_thumbnail=True, + prior_aspect_ratio=None, ) # second pass - pixel_values2, _ = load_image( + pixel_values2, _ = _preprocess_image( image, input_size=input_size, - min_num=min_num, + min_num=3, # Hardcoded value max_num=max_num, - prior_aspect_ratio=target_aspect_ratio, + use_thumbnail=True, + prior_aspect_ratio=aspect_ratio1, ) # combine pixel values pixel_values = torch.cat( - [pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0) + [pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0) else: - pixel_values, _ = load_image( + pixel_values, _ = _preprocess_image( image, input_size=input_size, min_num=min_num, max_num=max_num, use_thumbnail=use_thumbnail, + prior_aspect_ratio=None, ) return pixel_values -def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None, - use_MSAC: Optional[bool] = None): - image_size = hf_config.vision_config.image_size - min_num = hf_config.min_dynamic_patch - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - if use_MSAC is None: - use_MSAC = hf_config.use_msac - use_thumbnail = hf_config.use_thumbnail - return partial( - image_to_pixel_values, - input_size=image_size, - min_num=min_num, - max_num=max_dynamic_patch, - use_thumbnail=use_thumbnail, - use_MSAC=use_MSAC, - ) - +class H2OVLProcessor(BaseInternVLProcessor): -def get_max_internvl_image_tokens(ctx: InputContext, - *, - max_dynamic_patch: Optional[int] = None): - """ - Calculate the maximum number of tokens with/without MSAC and thumbnail - """ - hf_config = ctx.get_hf_config() - use_thumbnail = hf_config.use_thumbnail - use_MSAC = hf_config.use_msac - - if max_dynamic_patch is None: - 
max_dynamic_patch = hf_config.max_dynamic_patch + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_msac: Optional[bool] = None, + ) -> None: + super().__init__( + config, + tokenizer, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) - num_patches = get_internvl_num_patches(hf_config) + if use_msac is None: + use_msac = config.use_msac + assert isinstance(use_msac, bool) - coefficient = 2 if use_MSAC else 1 - num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0) + self.use_msac = use_msac - return num_blocks * num_patches + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[IMG_CONTEXT] + def get_image_repl_features( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + return IMG_CONTEXT * feature_size -class H2OVLInputPipeline(InternVLInputPipeline): - """ - Input pipeline for processing image and text data for the H2OVL model. 
- """ + def get_image_repl_full( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + features = self.get_image_repl_features(feature_size, num_patches) + return IMG_START + features + IMG_END - def input_processor( + def resolve_min_max_num( self, - ctx: InputContext, - inputs: DecoderOnlyInputs, *, max_dynamic_patch: Optional[int] = None, - ) -> DecoderOnlyInputs: - # get multi_modal_data - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - hf_config = ctx.get_hf_config() - use_MSAC = hf_config.use_msac - - image_data = multi_modal_data["image"] - num_patches = get_internvl_num_patches(hf_config) - - image_pixel_values_mapper = image_to_pixel_values_wrapper( - hf_config, max_dynamic_patch=max_dynamic_patch) - - # single image - if isinstance(image_data, Image.Image): - pixel_values = image_pixel_values_mapper(image_data, - use_MSAC=use_MSAC) - num_blocks = pixel_values.shape[0] - image_feature_sizes = [num_blocks * num_patches] - pixel_values = pixel_values.unsqueeze(0) - - # multi images - elif is_list_of(image_data, Image.Image): - # Do not use MSAC for multi images - image_feature_sizes = [] - pixel_values = [ - image_pixel_values_mapper(image, use_MSAC=False) - for image in image_data - ] - for pixel_value in pixel_values: - num_blocks = pixel_value.shape[0] - image_feature_sizes.append(num_blocks * num_patches) - - # image embeddings as input - elif isinstance(image_data, torch.Tensor): - _, image_feature_size, _ = image_data.shape - image_feature_sizes = [image_feature_size] - pixel_values = None - - # multi-image image embeddings - elif is_list_of(image_data, torch.Tensor): - - image_feature_sizes = [] - for image_embed in image_data: - _, image_feature_size, _ = image_embed.shape - image_feature_sizes.append(image_feature_size) - pixel_values = None + dynamic_image_size: Optional[bool] = None, + use_thumbnail: 
Optional[bool] = None, + ) -> tuple[int, int]: + min_dynamic_patch = self.min_dynamic_patch + max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch + is None else max_dynamic_patch) + dynamic_image_size = (self.dynamic_image_size if dynamic_image_size + is None else dynamic_image_size) + use_thumbnail = (self.use_thumbnail + if use_thumbnail is None else use_thumbnail) + + return resolve_h2ovl_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) - else: - raise TypeError(f"Invalid image type: {type(image_data)}") + def resolve_target_ratios( + self, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + prior_aspect_ratio: Optional[tuple[int, int]] = None, + ) -> list[tuple[int, int]]: + min_num, max_num = self.resolve_min_max_num( + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) + if prior_aspect_ratio: # hardcoded value for second pass of use_msac + min_num = 3 - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, + return get_h2ovl_target_ratios( + min_num, + max_num, + prior_aspect_ratio=prior_aspect_ratio, ) - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - if prompt is None: - prompt = tokenizer.decode(prompt_token_ids) - - new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, - num_patches) - new_prompt_token_ids = tokenizer.encode(new_prompt) - - # Wrap image processing in input_processor to avoid duplication - image_token_id = tokenizer.encode( - self.img_context_token, - add_special_tokens=False, - return_tensors="pt", - )[0] - - # Update multi_modal_data to return - if pixel_values is not None: - multi_modal_data = { - "image": { - "pixel_values": pixel_values, - "image_token_id": 
image_token_id, - } - } + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + use_msac: Optional[bool] = None, + ) -> int: + use_msac = (self.use_msac if use_msac is None else use_msac) + + use_thumbnail = self.use_thumbnail + + if use_msac: + target_ratios_1 = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios_1, + use_thumbnail=True, + ) + + target_ratios_2 = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + prior_aspect_ratio=aspect_ratio_1, + ) + num_patches_2, _, _, _ = calculate_h2ovl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios_2, + use_thumbnail=True, + ) + + num_patches = num_patches_1 + num_patches_2 - 1 else: - multi_modal_data = {"image": {"image_embeds": image_data}} + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + num_patches, _, _, _ = calculate_h2ovl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios, + use_thumbnail=use_thumbnail, + ) + + return num_patches * self.num_image_token - return token_inputs( - prompt=prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data, + def _images_to_pixel_values_lst( + self, + images: list[Image.Image], + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + use_msac = self.use_msac if len(images) == 1 else False + + min_num, max_num = self.resolve_min_max_num( + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values ) - def input_mapper( + return [ + 
image_to_pixel_values_h2ovl( + image, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + use_msac=use_msac, + ) for image in images + ] + + +class H2OVLProcessingInfo(BaseInternVLProcessingInfo): + + def get_hf_processor( self, - ctx: InputContext, - data: object, *, max_dynamic_patch: Optional[int] = None, - ) -> MultiModalKwargs: + dynamic_image_size: Optional[bool] = None, + ) -> H2OVLProcessor: + return H2OVLProcessor( + self.get_hf_config(), + self.get_tokenizer(), + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + max_tokens_one_image = self.get_max_image_tokens(use_msac=None) + if mm_counts.get("image", 0) <= 1: + max_tokens_per_image = max_tokens_one_image + else: + max_tokens_per_image = self.get_max_image_tokens(use_msac=False) + + return {"image": max_tokens_per_image} - # NOTE: Preprocessing for the image data is done in the - # 'input_processor' function during actual inference. - if isinstance(data, dict): - return MultiModalKwargs(data) + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[H2OVLProcessor], + use_msac: Optional[bool] = None, + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + use_msac=use_msac, + ) - # The section below is only used with dummy data during - # memory profiling. 
- hf_config = ctx.get_hf_config() + def get_max_image_tokens(self, use_msac: Optional[bool] = None) -> int: + target_width, target_height = self.get_image_size_with_most_features() - image_pixel_values_mapper = image_to_pixel_values_wrapper( - hf_config, max_dynamic_patch) + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=None, + use_msac=use_msac, + ) - if isinstance(data, Image.Image): - pixel_values = image_pixel_values_mapper(data) - pixel_values = pixel_values.unsqueeze(0) - elif is_list_of(data, Image.Image): - hf_config.use_msac = False - pixel_values = [image_pixel_values_mapper(img) for img in data] +class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo] + ): + + def __init__(self, + info: H2OVLProcessingInfo, + dummy_inputs: "BaseDummyInputsBuilder[H2OVLProcessingInfo]", + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__( + info, + dummy_inputs, + cache=cache, + enable_sanity_checks=enable_sanity_checks, + ) + + if self.cache is not None: + # The processor output depends on the number of images passed, + # making it incompatible with processing cache which is supposed + # to be invariant of how many images are passed per prompt + self.cache = None + logger.warning_once( + f"{type(self).__name__} does not support processing cache.") + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "image_num_patches" in out_mm_kwargs: + image_num_patches = out_mm_kwargs["image_num_patches"] + assert isinstance(image_num_patches, torch.Tensor) + image_num_patches = image_num_patches.tolist() + elif "image_embeds" in out_mm_kwargs: + # TODO: Use image size information in dictionary embedding inputs + # to compute 
num_patches (similar to Qwen2-VL) + image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) else: - return MultiModalKwargs({"image_embeds": data}) - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, - ) - image_token_id = tokenizer.encode( - self.img_context_token, - add_special_tokens=False, - return_tensors="pt", - )[0] + image_num_patches = [] + + num_images = len(image_num_patches) - return MultiModalKwargs({ - "pixel_values": pixel_values, - "image_token_id": image_token_id - }) + def get_replacement_internvl(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + use_msac=None if num_images == 1 else False, + ) + + num_patches = image_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return PromptReplacementDetails( + full=hf_processor.get_image_repl_full(feature_size, + num_patches), + features=hf_processor.get_image_repl_features( + feature_size, num_patches), + ) -input_pipeline = H2OVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) + return [ + PromptReplacement( + modality="image", + target="", + replacement=get_replacement_internvl, + ) + ] -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) -@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) -@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) +@MULTIMODAL_REGISTRY.register_processor( + H2OVLMultiModalProcessor, + info=H2OVLProcessingInfo, + dummy_inputs=InternVLDummyInputsBuilder) 
class H2OVLChatModel(InternVLChatModel): def _init_vision_model( diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 1d8f61e9f21029524e0323fc67bbf759a70cf402..7542447a8d40f4f223d421efc5df3f3d84bd4b7e 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py # Copyright 2024 The vLLM team. # Copyright 2024 the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index d16a77f862d98bbfc38e36a58c4ac82999d19577..fdfabbaafce3f90d1e03eff9205f48aa13c350d0 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,35 +16,35 @@ """Inference-only Idefics3 model compatible with HuggingFace weights.""" import math -from typing import (Dict, Iterable, List, Literal, Mapping, NamedTuple, - Optional, Set, Tuple, TypedDict, Union) +from typing import (Dict, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) import torch import torch.utils.checkpoint -from PIL import Image from torch import nn -# Temporary solution for transformers below 4.46.0. 
-from transformers import PretrainedConfig as Idefics3Config -from transformers import ProcessorMixin as Idefics3ImageProcessor +from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor, + Idefics3Processor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import NestedTensors -from vllm.sequence import IntermediateTensors, SequenceData -from vllm.transformers_utils.processor import cached_get_processor -from vllm.utils import is_list_of +from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, + MultiModalDataItems, + MultiModalFieldConfig, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors # yapf: disable from .idefics2_vision_model import ( @@ -75,307 +77,253 @@ class Idefics3ImageEmbeddingInputs(TypedDict): """ -class Idefics3ProcessorSize(NamedTuple): - """Hashable wrapper for unhashable `size` dict of Idefics3Processor.""" - # NOTE: cached_get_processor/cached_get_image_processor uses 
lru_cache, - # we need to use NamedTuple instead of TypedDict to avoid hashing issues. - longest_edge: int - - def __contains__(self, key: str) -> bool: - return key in self._asdict() and getattr(self, key) is not None - - def __getitem__(self, key: str) -> int: - return getattr(self, key) - - ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] -def get_mm_processor_kwargs(size: Optional[Dict[str, int]] = None) -> Dict: - mm_processor_kwargs = {} - if size: - mm_processor_kwargs["size"] = Idefics3ProcessorSize(**size) - return mm_processor_kwargs - - -def input_mapper_for_idefics3( - ctx: InputContext, - data: object, - *, - size: Optional[Dict[str, int]] = None, -): - model_config = ctx.model_config - mm_processor_kwargs = get_mm_processor_kwargs(size) - image_processor = cached_get_image_processor( - model_config.model, - trust_remote_code=model_config.trust_remote_code, - **mm_processor_kwargs) - if image_processor is None: - raise RuntimeError("No HuggingFace processor is available " - "to process the image object") - - if isinstance(data, Image.Image): - images = [[data]] - elif is_list_of(data, Image.Image): - images = [data] - else: - raise TypeError(f"Invalid image type: {type(data)}") - - try: - batch_data = image_processor(images, - return_tensors="pt", - return_row_col_info=True).data - except Exception: - logger.error("Failed to process image (%s)", data) - raise - - return MultiModalKwargs(batch_data) - - -def _resize_output_size(height: int, - width: int, - max_len: Optional[int] = None, - min_len: Optional[int] = 1, - max_size: Optional[int] = None) -> Tuple[int, int]: - # Set default value for max_len if not provided - max_len = max(height, width) if max_len is None else max_len - aspect_ratio = width / height - - # Handle the maximum size constraint - if max_size is not None: - max_len = min(max_len, max_size) - - # Adjust dimensions according to the aspect ratio - if width >= height: - width = max_len - height = int(width / 
aspect_ratio) - else: - height = max_len - width = int(height * aspect_ratio) - - # Ensure both width and height are even (if needed) - height += 1 if height % 2 != 0 else 0 - width += 1 if width % 2 != 0 else 0 - - # Ensure dimensions are not smaller than the minimum length - height = max(height, min_len) - width = max(width, min_len) - - return height, width - - -def _get_resize_output_image_size( - image_size: Tuple[int, int], - resolution_max_side: int, - max_image_size: int = 1820, -) -> Tuple[int, int]: - if resolution_max_side > max_image_size: - raise ValueError( - "`resolution_max_side` cannot be larger than `max_image_size`") - - height, width = image_size - - # Find the output size, when rescaling the longest edge to max_len and - # preserving the aspect ratio - height, width = _resize_output_size(height, - width, - max_len=resolution_max_side) - - return height, width - - -def _prompt_split_image(image_seq_len: int, image_rows: int, image_cols: int, - fake_token_around_image: str, image_token: str, - global_img_token: str) -> str: - """ - Prompt with expanded image tokens for when the image is split - into patches. 
- """ - text_split_images = "" - for n_h in range(image_rows): - for n_w in range(image_cols): - text_split_images += (fake_token_around_image + - f"" + - image_token * image_seq_len) - text_split_images += "\n" - - text_split_images += "\n" + _prompt_single_image( - image_seq_len=image_seq_len, - fake_token_around_image=fake_token_around_image, - image_token=image_token, - global_img_token=global_img_token) - return text_split_images - - -def _prompt_single_image(image_seq_len: int, fake_token_around_image: str, - image_token: str, global_img_token: str): - """Prompt with expanded image tokens for a single image.""" - return (fake_token_around_image + global_img_token + - image_token * image_seq_len + fake_token_around_image) - - -def _get_image_prompt_string(image_rows: int, image_cols: int, - image_seq_len: int, fake_token_around_image: str, - image_token: str, global_img_token: str): - if image_rows == 0 and image_cols == 0: - return _prompt_single_image( - image_seq_len=image_seq_len, - fake_token_around_image=fake_token_around_image, - image_token=image_token, - global_img_token=global_img_token, - ) - return _prompt_split_image(image_seq_len, image_rows, image_cols, - fake_token_around_image, image_token, - global_img_token) - - -def input_processor_for_idefics3(ctx: InputContext, - inputs: DecoderOnlyInputs, - *, - size: Optional[Dict[str, int]] = None): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - mm_processor_kwargs = get_mm_processor_kwargs(size) - processor = cached_get_processor(model_config.model, **mm_processor_kwargs) - image_processor = processor.image_processor - tokenizer = processor.tokenizer - size = image_processor.size['longest_edge'] - max_image_size = image_processor.max_image_size['longest_edge'] - - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - image_list = [image_data] - elif 
is_list_of(image_data, Image.Image): - image_list = image_data - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - image_rows = [] - image_cols = [] - for image in image_list: - height, width = _get_resize_output_image_size(image.size, size) - - rows = math.ceil(height / max_image_size) - cols = math.ceil(width / max_image_size) - image_rows.append(rows) - image_cols.append(cols) - image_rows = [image_rows] - image_cols = [image_cols] - - n_images_in_text = [] - - text = inputs.get("prompt") - if text is None: - prompt_token_ids = inputs.get("prompt_token_ids", []) - assert prompt_token_ids - text = tokenizer.decode(prompt_token_ids) - - if isinstance(text, str): - text = [text] - elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError("Invalid input text. Please provide a string, " - "or a list of strings") - - fake_image_token = processor.fake_image_token.content - image_token = processor.image_token.content - global_img_token = processor.global_image_tag - - prompt_strings = [] - for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): - n_images_in_text.append(sample.count(image_token)) - - # Replace the image token with fake tokens around the expanded - # image token sequence of length `image_seq_len` - image_prompt_strings = [] - for n_rows, n_cols in zip(sample_rows, sample_cols): - image_prompt_string = _get_image_prompt_string( - n_rows, - n_cols, - processor.image_seq_len, - image_token=image_token, - fake_token_around_image=fake_image_token, - global_img_token=global_img_token, - ) - image_prompt_strings.append(image_prompt_string) - - split_sample = sample.split(image_token) - if len(split_sample) == 0: - raise ValueError("The image token should be present in the text.") - - # Place in the image prompt strings where the image tokens are - sample = split_sample[0] - for i, image_prompt_string in enumerate(image_prompt_strings): - sample += image_prompt_string + split_sample[i + 1] - 
prompt_strings.append(sample) +class Idefics3ProcessingInfo(BaseProcessingInfo): - prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + def get_hf_processor( + self, + *, + size: Optional[Dict[str, int]] = None) -> Idefics3Processor: + if size is not None: + return self.ctx.get_hf_processor(Idefics3Processor, size=size) - return token_inputs( - prompt_token_ids=prompt_token_ids, - prompt=prompt_strings[0], - multi_modal_data=multi_modal_data, - ) + return self.ctx.get_hf_processor(Idefics3Processor) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} -def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int: - size = image_processor.size['longest_edge'] - max_image_size = image_processor.max_image_size['longest_edge'] - resized_height, resized_width = size, size + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + hf_processor = self.get_hf_processor() + image_processor: Idefics3ImageProcessor = hf_processor.image_processor + grid_w, grid_h = self._get_image_feature_grid_size( + image_width=image_processor.size['longest_edge'], + image_height=image_processor.size['longest_edge'], + ) + num_image_token = (grid_w * grid_h + 1) * hf_processor.image_seq_len + # Calculate Non-image-token length + # NOTE: and are special token for SmolVLM + # but not for Idefic3, so we need to tokenize them to get actual length. 
+ tokenizer = self.get_tokenizer() + tile_token_len = len(tokenizer.tokenize("")) + glob_token_len = len(tokenizer.tokenize(hf_processor.global_image_tag)) + # linebreak and always cost 1 token + fake_token_len = lb_len = 1 + non_image_token = (grid_w * grid_h) * ( + tile_token_len + fake_token_len) + glob_token_len + ( + grid_h + 1) * lb_len + fake_token_len + return {"image": num_image_token + non_image_token} + + def _resize_output_size(self, + *, + height: int, + width: int, + max_len: Optional[int] = None, + min_len: Optional[int] = 1, + max_size: Optional[int] = None) -> tuple[int, int]: + # Set default value for max_len if not provided + max_len = max(height, width) if max_len is None else max_len + aspect_ratio = width / height + + # Handle the maximum size constraint + if max_size is not None: + max_len = min(max_len, max_size) + + # Adjust dimensions according to the aspect ratio + if width >= height: + width = max_len + height = int(width / aspect_ratio) + else: + height = max_len + width = int(height * aspect_ratio) - grid_h = resized_height // max_image_size - grid_w = resized_width // max_image_size - return (grid_h * grid_w + 1) + # Ensure both width and height are even (if needed) + height += height % 2 + width += width % 2 + # Ensure dimensions are not smaller than the minimum length + height = max(height, min_len) + width = max(width, min_len) -def get_max_idefics3_image_tokens(ctx: InputContext, - *, - size: Optional[Dict[str, - int]] = None) -> int: - model_config = ctx.model_config - mm_processor_kwargs = get_mm_processor_kwargs(size) - processor = cached_get_processor(model_config.model, **mm_processor_kwargs) - image_seq_len = processor.image_seq_len - image_processor = processor.image_processor + return height, width - max_num_image_patches = _get_max_num_image_patch(image_processor) + def _get_resize_output_image_size( + self, + *, + image_width: int, + image_height: int, + resolution_max_side: int, + ) -> tuple[int, int]: + hf_processor = 
self.get_hf_processor() + image_processor: Idefics3ImageProcessor = hf_processor.image_processor + max_image_size = image_processor.size['longest_edge'] + if resolution_max_side > max_image_size: + raise ValueError( + "`resolution_max_side` cannot be larger than `max_image_size`") + + height, width = image_height, image_width + + # Find the output size, when rescaling the longest edge to max_len and + # preserving the aspect ratio + height, width = self._resize_output_size(height=height, + width=width, + max_len=resolution_max_side) + return height, width + + def _get_image_feature_grid_size( + self, + *, + image_width: int, + image_height: int, + size: Optional[dict[str, object]] = None, + ) -> tuple[int, int]: + hf_processor = self.get_hf_processor(size=size) + image_processor: Idefics3ImageProcessor = hf_processor.image_processor + max_image_size = image_processor.max_image_size['longest_edge'] + size = image_processor.size['longest_edge'] + assert size % max_image_size == 0, ( + "`longest_edge` in image_processor's `size` must be divisible by " + "`longest_edge` in `max_image_size`, this may be caused by " + "incorrect mm_kwargs override.") + + resized_height, resized_width = self._get_resize_output_image_size( + image_width=image_width, + image_height=image_height, + resolution_max_side=size, + ) + if resized_height > max_image_size or resized_width > max_image_size: + grid_h = math.ceil(resized_height / max_image_size) + grid_w = math.ceil(resized_width / max_image_size) + else: + grid_h = grid_w = 0 + return grid_w, grid_h - return max_num_image_patches * image_seq_len +class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo] + ): -def dummy_data_for_idefics3( - ctx: InputContext, + def get_dummy_processor_inputs( + self, seq_len: int, mm_counts: Mapping[str, int], - *, - size: Optional[Dict[str, int]] = None) -> DummyData: - hf_config = ctx.get_hf_config() - num_images = mm_counts["image"] + ) -> ProcessorInputs: + num_images = 
mm_counts.get("image", 0) + hf_processor = self.info.get_hf_processor() + image_processor: Idefics3ImageProcessor = hf_processor.image_processor + longest_edge = image_processor.max_image_size['longest_edge'] + image_token: str = hf_processor.image_token.content + + mm_data = { + "image": + self._get_dummy_images(width=longest_edge, + height=longest_edge, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) - mm_processor_kwargs = get_mm_processor_kwargs(size) - processor = cached_get_processor(ctx.model_config.model, - **mm_processor_kwargs) - max_num_image_patches = _get_max_num_image_patch(processor.image_processor) - image_seq_len = processor.image_seq_len - max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images - if seq_len - max_llm_image_tokens < 0: - raise RuntimeError( - f"Idefics3 cannot process {num_images} images in a prompt, " - "please increase max_model_len or reduce image limit by " - "--limit-mm-per-prompt.") +class Idefics3MultimodalProcessor( + BaseMultiModalProcessor[Idefics3ProcessingInfo]): - seq_data = SequenceData.from_prompt_token_counts( - (hf_config.image_token_id, max_llm_image_tokens), - (0, seq_len - max_llm_image_tokens)) + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if mm_data: + processed_outputs = super()._call_hf_processor( + prompt, mm_data, mm_kwargs) + image_grids = [ + self.info._get_image_feature_grid_size( + image_width=img.width, + image_height=img.height, + **mm_kwargs, + ) for img in mm_data["images"] + ] + image_patches = list(map(lambda x: math.prod(x) + 1, image_grids)) + for key in ("pixel_values", "pixel_attention_mask"): + data = processed_outputs.pop(key) + data = data.flatten(0, 1).split(image_patches) + processed_outputs[key] = data + else: + tokenizer = self.info.get_tokenizer() + processed_outputs = tokenizer(prompt, + 
add_special_tokens=True, + return_tensors="pt") + return processed_outputs - width = height = hf_config.vision_config.image_size - image = Image.new("RGB", (width, height), color=0) - mm_data = {"image": [image] if num_images == 1 else [image] * num_images} + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + pixel_attention_mask=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) - return DummyData(seq_data, mm_data) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + image_token = hf_processor.image_token.content + fake_image_token = hf_processor.fake_image_token.content + global_img_token = hf_processor.global_image_tag + image_seq_len = hf_processor.image_seq_len + grid_placeholder = "" + + p_img = image_token * image_seq_len + global_img_placeholder = fake_image_token + global_img_token + p_img + tile_img_placeholder = fake_image_token + grid_placeholder + p_img + + def get_replacement_idefics3(item_idx: int) -> str: + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + grid_w, grid_h = self.info._get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + **hf_processor_mm_kwargs, + ) + if grid_w == 0 and grid_h == 0: + image_placeholder = global_img_placeholder + else: + tiles_placeholder = list[str]() + for i in range(grid_h): + for j in range(grid_w): + placeholder_per_tile = tile_img_placeholder.format( + n_h=i + 1, n_w=j + 1) + tiles_placeholder.append(placeholder_per_tile) + # Add line break if it is the last tile in the row + if j == 
grid_w - 1: + tiles_placeholder.append("\n") + + image_placeholder = "".join( + [*tiles_placeholder, "\n", global_img_placeholder]) + return image_placeholder + fake_image_token + + return [ + PromptReplacement( + modality="image", + target=image_token, + replacement=get_replacement_idefics3, + ) + ] class Idefics3SimpleMLP(nn.Module): @@ -451,7 +399,7 @@ class Idefics3Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + config: Idefics3Config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.config = config @@ -539,15 +487,13 @@ class Idefics3Model(nn.Module): self, pixel_values: torch.Tensor, pixel_attention_mask: Optional[torch.BoolTensor] = None, - ) -> torch.Tensor: + ) -> NestedTensors: # NOTE: we skip the step to select the vision feature layer since # this is already done inside the vision tower - batch_size, num_images, num_channels, height, width = pixel_values.shape + num_patches = [x.size(0) for x in pixel_values] pixel_values = pixel_values.to( dtype=self.vision_model.embeddings.patch_embedding.weight.dtype ) # fp16 compatibility - pixel_values = pixel_values.view(batch_size * num_images, - *pixel_values.shape[2:]) # Remove padding images - padding images are full 0. 
nb_values_per_image = pixel_values.shape[1:].numel() @@ -565,8 +511,6 @@ class Idefics3Model(nn.Module): ) else: # Remove padding images from the mask - pixel_attention_mask = pixel_attention_mask.view( - batch_size * num_images, *pixel_attention_mask.shape[2:]) pixel_attention_mask = pixel_attention_mask[ real_images_inds].contiguous() @@ -585,10 +529,10 @@ class Idefics3Model(nn.Module): patch_attention_mask=patch_attention_mask, ) - return image_hidden_states + return image_hidden_states.split(num_patches) def _process_image_pixels( - self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor: + self, inputs: Idefics3ImagePixelInputs) -> NestedTensors: assert self.vision_model is not None pixel_values = inputs["data"] @@ -603,7 +547,9 @@ class Idefics3Model(nn.Module): assert self.vision_model is not None image_features = self._process_image_pixels(image_input) - return self.connector(image_features) + num_patches = [x.size(0) for x in image_features] + image_features = torch.cat(image_features) + return self.connector(image_features).split(num_patches) def get_input_embeddings( self, @@ -632,10 +578,10 @@ class Idefics3Model(nn.Module): return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_idefics3) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) -@INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) +@MULTIMODAL_REGISTRY.register_processor( + Idefics3MultimodalProcessor, + info=Idefics3ProcessingInfo, + dummy_inputs=Idefics3DummyInputsBuilder) class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA): packed_modules_mapping = { @@ -687,7 +633,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, if self.config.text_config.tie_word_embeddings: self.lm_head.weight = self.model.text_model.wte.weight self.logits_processor = LogitsProcessor(config.text_config.vocab_size) - 
self.sampler = Sampler() + self.sampler = get_sampler() def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self.model._parse_and_validate_image_input(**kwargs) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index ac559686dda5946994b1061cc5455ef53745ee97..6ad67fd95b95f45e8a8a395746b7bb1729270ae5 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, Protocol, Type, Union, overload, runtime_checkable) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 37b91a803d71ea0139f1f1f7b1ff18dbb1b8a20f..c5f7be135d71ab7a3a7336a41d894fe9a11b92d7 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import (TYPE_CHECKING, List, Optional, Protocol, Type, Union, overload, runtime_checkable) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 96deaff61e6e628ec7945ebbb12bb8cc1e45acdd..cb41df894f3c93254c740f70a8ff6ffab5b62da8 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py # -------------------------------------------------------- # InternVL diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 28c23edd4c8e8d9f857f7ea01e053507c314ea08..c211ca5f4f8e9b556feda010b5616f7a7579e2c9 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools 
import partial from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 93ac2dcf8d5871abe96d2d8e6b403b4213d93a5b..106c3b6b78cc8c87417f9132c76e0f420f489f62 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index f4b7e4478c1647d64673cb82d05461608a4ece08..380eb40d9eb2823e0bdab63cf010fd9838fe7ac7 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1,38 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py # -------------------------------------------------------- # InternVL # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -import re -from functools import cached_property, partial +from abc import ABC, abstractmethod +from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) + TypedDict, TypeVar, Union) import torch import torch.nn as nn import torchvision.transforms as T from PIL import Image -from transformers import PretrainedConfig +from transformers import BatchFeature, PretrainedConfig, TensorType from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.sampler import 
SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of +from vllm.transformers_utils.tokenizer import AnyTokenizer -from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, - get_clip_num_patches) from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -73,22 +77,27 @@ InternVLImageInputs = Union[InternVLImagePixelInputs, InternVLImageEmbeddingInputs] -# copied from https://huggingface.co/OpenGVLab/InternVL2-1B -def build_transform(input_size): +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def build_transform(input_size: int): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD - transform = T.Compose([ + return T.Compose([ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC), T.ToTensor(), T.Normalize(mean=MEAN, std=STD) ]) - return transform -# copied from https://huggingface.co/OpenGVLab/InternVL2-1B -def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, 
- image_size): +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def find_closest_aspect_ratio( + aspect_ratio: float, + target_ratios: list[tuple[int, int]], + *, + width: int, + height: int, + image_size: int, +) -> tuple[int, int]: best_ratio_diff = float('inf') best_ratio = (1, 1) area = width * height @@ -104,67 +113,82 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, return best_ratio -def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, - max_num: int, image_size: int, - use_thumbnail: bool) -> Tuple[int, int, int]: - aspect_ratio = orig_width / orig_height +def resolve_internvl_min_max_num( + *, + min_dynamic_patch: int, + max_dynamic_patch: int, + dynamic_image_size: bool, + use_thumbnail: bool, +) -> tuple[int, int]: + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 + + if use_thumbnail and max_dynamic_patch != 1: + max_dynamic_patch += 1 + + return min_dynamic_patch, max_dynamic_patch - # calculate the existing image aspect ratio - target_ratios = set((i, j) for n in range(min_num, max_num + 1) - for i in range(1, n + 1) for j in range(1, n + 1) - if i * j <= max_num and i * j >= min_num) - target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + +def get_internvl_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +def calculate_internvl_targets( + *, + orig_width: int, + orig_height: int, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> tuple[int, int, int]: + aspect_ratio = orig_width / orig_height # find the closest aspect ratio to the target - target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, - target_ratios, orig_width, - orig_height, image_size) + target_aspect_ratio = 
find_closest_aspect_ratio( + aspect_ratio, + target_ratios, + width=orig_width, + height=orig_height, + image_size=image_size, + ) # calculate the target width and height target_width = image_size * target_aspect_ratio[0] target_height = image_size * target_aspect_ratio[1] blocks = target_aspect_ratio[0] * target_aspect_ratio[1] - # add thumbnail image if num_blocks > 1 - if use_thumbnail and blocks > 1: - blocks += 1 - return blocks, target_width, target_height - -def calculate_num_blocks_wrapper( - hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, -): - if dynamic_image_size is None: - dynamic_image_size = hf_config.dynamic_image_size + # add thumbnail image if num_blocks != 1 + if use_thumbnail and blocks != 1: + blocks += 1 - max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - min_num = hf_config.min_dynamic_patch - image_size = hf_config.vision_config.image_size - use_thumbnail = hf_config.use_thumbnail - return partial(calculate_num_blocks, - min_num=min_num, - max_num=max_dynamic_patch, - image_size=image_size, - use_thumbnail=use_thumbnail) + return blocks, target_width, target_height # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, - image_size: int, - use_thumbnail: bool) -> List[Image.Image]: +def dynamic_preprocess_internvl( + image: Image.Image, + *, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> list[Image.Image]: orig_width, orig_height = image.size # calculate the number of blocks without thumbnail - blocks, target_width, target_height = calculate_num_blocks( - orig_width, - orig_height, - min_num, - max_num, - image_size, - use_thumbnail=False) + blocks, target_width, target_height = calculate_internvl_targets( + orig_width=orig_width, + orig_height=orig_height, 
+ target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False, + ) + # resize the image resized_img = image.resize((target_width, target_height)) processed_images = [] @@ -176,301 +200,463 @@ def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, # split the image split_img = resized_img.crop(box) processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: thumbnail_img = image.resize((image_size, image_size)) processed_images.append(thumbnail_img) + return processed_images # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def image_to_pixel_values(image: Image.Image, input_size: int, min_num: int, - max_num: int, use_thumbnail: bool) -> torch.Tensor: +def image_to_pixel_values_internvl( + image: Image.Image, + *, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, +) -> torch.Tensor: + target_ratios = get_internvl_target_ratios(min_num, max_num) + transform = build_transform(input_size=input_size) - images = dynamic_preprocess(image, - min_num=min_num, - max_num=max_num, - image_size=input_size, - use_thumbnail=use_thumbnail) - pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) + images = dynamic_preprocess_internvl( + image, + target_ratios=target_ratios, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) + + pixel_values = torch.stack([transform(image) for image in images]) return pixel_values -def image_to_pixel_values_wrapper( - hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, -): - image_size = hf_config.vision_config.image_size - min_num = hf_config.min_dynamic_patch - if dynamic_image_size is None: - dynamic_image_size = hf_config.dynamic_image_size +class BaseInternVLProcessor(ABC): + """ + This model doesn't define its own HF processor, + so we implement our own one here. 
- max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - use_thumbnail = hf_config.use_thumbnail - return partial(image_to_pixel_values, - input_size=image_size, - min_num=min_num, - max_num=max_dynamic_patch, - use_thumbnail=use_thumbnail) - - -def get_internvl_num_patches(hf_config: PretrainedConfig): - vision_config = hf_config.vision_config - downsample_ratio = hf_config.downsample_ratio - image_size = vision_config.image_size - patch_size = vision_config.patch_size - return int( - get_clip_num_patches(image_size=image_size, patch_size=patch_size) * - (downsample_ratio**2)) - - -def get_max_internvl_image_tokens( - ctx: InputContext, - *, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, -): - hf_config = ctx.get_hf_config() - if dynamic_image_size is None: - dynamic_image_size = hf_config.dynamic_image_size + The code to insert image tokens is based on: + https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252 + """ - max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - use_thumbnail = hf_config.use_thumbnail - if use_thumbnail and max_dynamic_patch > 1: - max_dynamic_patch += 1 + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> None: + super().__init__() - num_patches = get_internvl_num_patches(hf_config) - return num_patches * max_dynamic_patch + self.config = config + self.tokenizer = tokenizer + image_size: int = config.vision_config.image_size + patch_size: int = config.vision_config.patch_size -def get_max_internvl_image_size( - ctx: InputContext, - *, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, -): - hf_config = 
ctx.get_hf_config() - image_size = hf_config.vision_config.image_size - if dynamic_image_size is None: - dynamic_image_size = hf_config.dynamic_image_size + if dynamic_image_size is None: + dynamic_image_size = config.dynamic_image_size + assert isinstance(dynamic_image_size, bool) - max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 - if max_dynamic_patch is None: - max_dynamic_patch = hf_config.max_dynamic_patch - use_thumbnail = hf_config.use_thumbnail - if use_thumbnail and max_dynamic_patch > 1: - max_dynamic_patch += 1 - width = image_size * max_dynamic_patch - height = image_size - return width, height + if max_dynamic_patch is None: + max_dynamic_patch = config.max_dynamic_patch + assert isinstance(max_dynamic_patch, int) + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.image_size = image_size + self.min_dynamic_patch: int = config.min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail: bool = config.use_thumbnail + + @property + @abstractmethod + def image_token_id(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_image_repl_features( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + raise NotImplementedError -class InternVLInputPipeline: + @abstractmethod + def get_image_repl_full( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + raise NotImplementedError - def __init__( + def resolve_min_max_num( self, - img_start_token: str, - img_end_token: str, - img_context_token: str, - ) -> None: - super().__init__() + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + ) -> tuple[int, int]: + min_dynamic_patch = self.min_dynamic_patch + max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch + is None else max_dynamic_patch) + dynamic_image_size = 
(self.dynamic_image_size if dynamic_image_size + is None else dynamic_image_size) + use_thumbnail = (self.use_thumbnail + if use_thumbnail is None else use_thumbnail) + + return resolve_internvl_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) - self.img_start_token = img_start_token - self.img_end_token = img_end_token - self.img_context_token = img_context_token + def resolve_target_ratios( + self, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + use_thumbnail: Optional[bool] = None, + ) -> list[tuple[int, int]]: + min_num, max_num = self.resolve_min_max_num( + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=use_thumbnail, + ) - def _create_image_prompt(self, feature_size: int, num_patches: int) -> str: - return (self.img_start_token + self.img_context_token * feature_size + - self.img_end_token) + return get_internvl_target_ratios(min_num, max_num) - def _expand_image_prompt( + def get_num_image_tokens( self, - prompt: str, - feature_sizes: List[int], - num_patches: int, - ) -> str: - image_idx = sorted( - map(int, re.findall(r"Image-(\d+): \n", prompt))) + *, + image_width: int, + image_height: int, + ) -> int: + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) - new_prompt = prompt - for idx, feature_size in enumerate(feature_sizes, start=1): - image_prompt = self._create_image_prompt(feature_size, num_patches) - if not image_idx: - image_prompt = f"Image-{idx}: {image_prompt}" + num_patches, _, _ = calculate_internvl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios, + use_thumbnail=self.use_thumbnail, + ) - new_prompt = new_prompt.replace('', image_prompt, 1) + return num_patches * self.num_image_token - return new_prompt + def 
_images_to_pixel_values_lst( + self, + images: list[Image.Image], + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + min_num, max_num = self.resolve_min_max_num( + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) + + return [ + image_to_pixel_values_internvl( + image, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] - def input_processor( + def __call__( self, - ctx: InputContext, - inputs: DecoderOnlyInputs, - *, + text: Optional[Union[str, list[str]]] = None, + images: Optional[Union[Image.Image, list[Image.Image]]] = None, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, - ) -> DecoderOnlyInputs: - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - hf_config = ctx.get_hf_config() - - image_data = multi_modal_data["image"] - num_patches = get_internvl_num_patches(hf_config) - num_blocks_calculator = calculate_num_blocks_wrapper( - hf_config, max_dynamic_patch, dynamic_image_size) - if isinstance(image_data, Image.Image): - width, height = image_data.size - num_blocks, _, _ = num_blocks_calculator(width, height) - image_feature_sizes = [num_blocks * num_patches] - elif is_list_of(image_data, Image.Image): - image_feature_sizes = [] - for image in image_data: - width, height = image.size - num_blocks, _, _ = num_blocks_calculator(width, height) - image_feature_sizes.append(num_blocks * num_patches) - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - image_feature_sizes = [image_feature_size] + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not 
isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + if len(images) == 0: + image_inputs = {} else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - if prompt is None: - prompt = tokenizer.decode(prompt_token_ids) - - new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, - num_patches) - new_prompt_token_ids = tokenizer.encode(new_prompt) - img_context_token_id = tokenizer.encode(self.img_context_token, - add_special_tokens=False) - assert len(img_context_token_id) == 1, \ - (f"Invalid image token '{self.img_context_token}': A valid image " - f"token encodes to a single token ID, got {img_context_token_id}.") - img_context_token_id = img_context_token_id[0] - - # Get precise tracking of placeholder positions - token_idx = image_idx = 0 - placeholder_ranges = [] - while token_idx < len(new_prompt_token_ids): - if new_prompt_token_ids[token_idx] == img_context_token_id: - curr_image_featue_size = image_feature_sizes[image_idx] - placeholder_ranges.append( - PlaceholderRange(offset=token_idx, - length=curr_image_featue_size)) - image_idx += 1 - token_idx += curr_image_featue_size - else: - token_idx += 1 + pixel_values_lst = self._images_to_pixel_values_lst( + images, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + image_inputs = { + "pixel_values_flat": torch.cat(pixel_values_lst), + "image_num_patches": list(map(len, pixel_values_lst)), + } + + for pixel_values in pixel_values_lst: + num_patches = pixel_values.shape[0] + feature_size = num_patches * self.num_image_token + + image_repl = self.get_image_repl_full(feature_size, + num_patches) + text = [t.replace('', image_repl, 1) for t in text] + + text_inputs = 
self.tokenizer(text) + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) - return token_inputs( - prompt=prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": placeholder_ranges}) - def input_mapper( +class InternVLProcessor(BaseInternVLProcessor): + + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[IMG_CONTEXT] + + def get_image_repl_features( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + return IMG_CONTEXT * feature_size + + def get_image_repl_full( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + features = self.get_image_repl_features(feature_size, num_patches) + return IMG_START + features + IMG_END + + +class BaseInternVLProcessingInfo(BaseProcessingInfo): + + @abstractmethod + def get_hf_processor( self, - ctx: InputContext, - data: object, *, max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, - ): - hf_config = ctx.get_hf_config() - - image_pixel_values_mapper = image_to_pixel_values_wrapper( - hf_config, max_dynamic_patch, dynamic_image_size) - if isinstance(data, Image.Image): - data = image_pixel_values_mapper(data) - # Add an N dimension for number of images per prompt (currently 1). 
- data = data.unsqueeze(0) - elif is_list_of(data, Image.Image): - # we can't stack here because images may have different num_patches - data = [image_pixel_values_mapper(img) for img in data] - else: - return MultiModalKwargs({"image_embeds": data}) - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_token_id = tokenizer.encode(self.img_context_token, - add_special_tokens=False, - return_tensors="pt")[0] - - return MultiModalKwargs({ - "pixel_values": data, - "image_token_id": image_token_id - }) - - def dummy_data( + ) -> BaseInternVLProcessor: + raise NotImplementedError + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( self, - ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} + + def get_num_image_tokens( + self, *, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - ): - num_images = mm_counts["image"] + image_width: int, + image_height: int, + processor: Optional[BaseInternVLProcessor], + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + return processor.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ) - hf_config = ctx.get_hf_config() + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() - image_feature_size = get_max_internvl_image_tokens( - ctx, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=None, ) - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - seq_data, ranges = dummy_seq_data_for_clip( - 
hf_config.vision_config, - seq_len, - num_images, - image_token_id=tokenizer.encode(self.img_context_token, - add_special_tokens=False)[0], - image_feature_size_override=image_feature_size, + + def get_image_size_with_most_features(self) -> ImageSize: + processor = self.get_hf_processor() + + base_size = processor.image_size + target_ratios = processor.resolve_target_ratios() + + largest_feature_size, largest_feature_pinpoint = 0, None + for wr, hr in target_ratios: + width, height = base_size * wr, base_size * hr + + feat_size = self.get_num_image_tokens( + image_width=width, + image_height=height, + processor=processor, + ) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise ValueError("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + + +_I = TypeVar("_I", bound=BaseInternVLProcessingInfo) + + +class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="" * num_images, + mm_data=mm_data, ) - max_image_width, max_image_height = get_max_internvl_image_size( - ctx, - max_dynamic_patch=max_dynamic_patch, - dynamic_image_size=dynamic_image_size, + +class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, ) - mm_data = 
dummy_image_for_clip( - hf_config.vision_config, - num_images, - image_width_override=max_image_width, - image_height_override=max_image_height, + image_token_id = self.info.get_hf_processor(**mm_kwargs).image_token_id + image_data = mm_data.get("images", []) + assert isinstance(image_data, list) + + # Since there may be extra tokens in the feature placeholders, + # we need to pass the image token ID to the model to select the + # tokens to merge from the vision encoder outputs + processed_outputs["image_token_id"] = torch.tensor(image_token_id) + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) + num_images = len(image_num_patches) + + return dict( + pixel_values_flat=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_patches), + image_num_patches=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + image_token_id=MultiModalFieldConfig.shared("image", num_images), ) - return DummyData(seq_data, mm_data, ranges) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "image_num_patches" in out_mm_kwargs: + image_num_patches = out_mm_kwargs["image_num_patches"] + assert isinstance(image_num_patches, torch.Tensor) + image_num_patches = image_num_patches.tolist() + elif "image_embeds" in out_mm_kwargs: + # TODO: Use image size information in dictionary embedding inputs + # to compute num_patches (similar to Qwen2-VL) + image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + else: + image_num_patches = [] + + def get_replacement_internvl(item_idx: int): + images = mm_items.get_items( + "image", 
(ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + num_patches = image_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return PromptReplacementDetails( + full=hf_processor.get_image_repl_full(feature_size, + num_patches), + features=hf_processor.get_image_repl_features( + feature_size, num_patches), + ) + return [ + PromptReplacement( + modality="image", + target="", + replacement=get_replacement_internvl, + ) + ] -input_pipeline = InternVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) +class InternVLProcessingInfo(BaseInternVLProcessingInfo): -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) -@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) -@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) + def get_hf_processor( + self, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> InternVLProcessor: + return InternVLProcessor( + self.get_hf_config(), + self.get_tokenizer(), + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + +@MULTIMODAL_REGISTRY.register_processor( + InternVLMultiModalProcessor, + info=InternVLProcessingInfo, + dummy_inputs=InternVLDummyInputsBuilder) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -619,11 +805,11 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[InternVLImageInputs]: - pixel_values = 
kwargs.pop("pixel_values", None) - image_token_id = kwargs.pop("image_token_id", None) + pixel_values_flat = kwargs.pop("pixel_values_flat", None) + image_num_patches = kwargs.pop("image_num_patches", None) image_embeds = kwargs.pop("image_embeds", None) - if pixel_values is None and image_embeds is None: + if pixel_values_flat is None and image_embeds is None: return None if image_embeds is not None: @@ -636,31 +822,30 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): data=flatten_bn(image_embeds), ) - self.img_context_token_id = image_token_id[0] + image_token_id = kwargs["image_token_id"] + assert isinstance(image_token_id, torch.Tensor) + self.img_context_token_id = image_token_id.flatten().unique().item() - if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): + if pixel_values_flat is not None: + if not isinstance(pixel_values_flat, (torch.Tensor, list)): raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - patches_per_image = [] - for request_pixel_values in pixel_values: - for image_pixel_values in request_pixel_values: - patches_per_image.append(image_pixel_values.shape[0]) - # We need to flatten (B, N, P) to (B*N*P), - # so we call flatten_bn twice. 
+ f"Got type: {type(pixel_values_flat)}") + + assert isinstance(image_num_patches, (torch.Tensor, list)) + return InternVLImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( - flatten_bn(flatten_bn(pixel_values), concat=True)), - patches_per_image=patches_per_image) + flatten_bn(pixel_values_flat, concat=True)), + patches_per_image=flatten_bn(image_num_patches, + concat=True).tolist()) raise AssertionError("This line should be unreachable.") def _process_image_input( self, image_input: InternVLImageInputs, - ) -> Tuple[torch.Tensor]: + ) -> tuple[torch.Tensor, ...]: if image_input["type"] == "image_embeds": return image_input["data"] @@ -687,7 +872,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): image_embeds = image_embeds.split(image_feature_sizes) return image_embeds - def _set_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None: if self.is_mono: self.visual_token_mask = ( input_ids == self.img_context_token_id).reshape(-1, 1) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 8c81dff6b57687d6e9da7ca9b97f7b74a8b750ea..72bcef5e2282f9324130c99fc6fb33999699ebd6 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 890b5530b97d68d27035e44bda314dfc62c3ccca..d82c0815213bcc6cb191595eee1cea4dfc78dfd6 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Inference-only Jamba model.""" from typing import Iterable, List, Optional, Set, Tuple diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 46235de97e36e34730883e99a79cfe3948fe9b8b..25aa0384f4d7025f1a7b9fd2b85b99c3a033a81d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 296af2aac5660b7652ce7f0cd2e04af6241c78db..b1fee3eeb542f0f6748b72691377d2e139c49450 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, @@ -73,19 +75,20 @@ class LlavaMultiModalProjector(nn.Module): vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str, + multimodal_projector_bias: bool, quant_config: Optional[QuantizationConfig] = None, prefix: str = ""): super().__init__() self.linear_1 = ColumnParallelLinear(vision_hidden_size, text_hidden_size, - bias=True, + bias=multimodal_projector_bias, quant_config=quant_config, prefix=f"{prefix}.linear_1") self.act = get_act_fn(projector_hidden_act) self.linear_2 = RowParallelLinear(text_hidden_size, text_hidden_size, - bias=True, + bias=multimodal_projector_bias, quant_config=quant_config, 
prefix=f"{prefix}.linear_2") @@ -122,7 +125,11 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_max_image_tokens()} def _apply_feature_select_strategy( @@ -286,16 +293,29 @@ class PixtralHFMultiModalProcessor( pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: - images = mm_data["images"] - assert isinstance(images, list) - - # Original output: (1, num_images, C, H, W) - # New output: (num_images, C, H, W) - assert (isinstance(pixel_values, list) and len(pixel_values) == 1) - assert (isinstance(pixel_values[0], list) - and len(pixel_values[0]) == len(images)) + # Before/after https://github.com/huggingface/transformers/pull/35122 + if Version(TRANSFORMERS_VERSION) <= Version("4.48.2"): + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, C, H, W) + # New output: (num_images, C, H, W) + assert (isinstance(pixel_values, list) + and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) + and len(pixel_values[0]) == len(images)) + + processed_outputs["pixel_values"] = pixel_values[0] + else: + # Avoid padding since we need the output for each image to be + # independent of other images for the cache to work correctly + image_sizes = processed_outputs["image_sizes"] + assert len(pixel_values) == len(image_sizes) - processed_outputs["pixel_values"] = pixel_values[0] + processed_outputs["pixel_values"] = [ + p[:, :h, :w] + for p, (h, w) in zip(pixel_values, image_sizes) + ] return processed_outputs @@ -501,6 +521,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): vision_hidden_size=config.vision_config.hidden_size, 
text_hidden_size=config.text_config.hidden_size, projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=config.multimodal_projector_bias, quant_config=quant_config, prefix=maybe_prefix(prefix, "multi_modal_projector")) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index fda4f22d366b10d6c5c0778e913ddef782d154bb..719916642f25c5067d0c1bd2b5a727cd5cbd4fa0 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, @@ -71,7 +73,15 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): return self.ctx.get_hf_config(LlavaNextConfig) def get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaNextProcessor) + hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor) + + # In case patch_size is omitted from `processor_config.json` + # e.g. 
for E5-V: https://huggingface.co/royokong/e5-v + if hf_processor.patch_size is None: + patch_size = self.get_vision_encoder_info().get_patch_size() + hf_processor.patch_size = patch_size + + return hf_processor # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113 def get_num_image_tokens( @@ -229,7 +239,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=vision_hidden_size, text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act) + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=config.multimodal_projector_bias) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 5be85d7c0f0338447699fb37f04f518c92bf0182..817edcef4ba14af881b7e88c15d5cccfda428a1d 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, @@ -60,7 +62,11 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"video": 1} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: target_width, target_height = self.get_image_size_with_most_features() max_video_tokens = self.get_num_video_tokens( @@ -251,16 +257,16 @@ class LlavaNextVideoPooler(nn.Module): class LlavaNextMultiModalProjector(nn.Module): def __init__(self, vision_hidden_size: int, text_hidden_size: int, - 
projector_hidden_act: str): + projector_hidden_act: str, multimodal_projector_bias: bool): super().__init__() self.linear_1 = nn.Linear(vision_hidden_size, text_hidden_size, - bias=True) + bias=multimodal_projector_bias) self.act = get_act_fn(projector_hidden_act) self.linear_2 = nn.Linear(text_hidden_size, text_hidden_size, - bias=True) + bias=multimodal_projector_bias) def forward(self, image_features: torch.Tensor) -> torch.Tensor: hidden_states = self.linear_1(image_features) @@ -296,7 +302,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act) + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=config.multimodal_projector_bias) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, hf_config=config.text_config, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5b0f35b08646b7daca8d60758bfef17eae35c929..2889426283f84a38c9318df1a06c6c4c57806596 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, @@ -101,7 +103,11 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return { "image": self.get_max_image_tokens(), "video": self.get_max_video_tokens(seq_len), @@ -370,11 +376,11 @@ class 
LlavaOnevisionMultiModalProjector(nn.Module): self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, - bias=True) + bias=config.multimodal_projector_bias) self.act = get_act_fn(config.projector_hidden_act) self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, - bias=True) + bias=config.multimodal_projector_bias) def forward(self, image_features: torch.Tensor) -> torch.Tensor: hidden_states = self.linear_1(image_features) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 553bc9c28cb2191f47fa6a0ffee4fce3ec950b96..5034b334564e820dbc9eaec18374db258381c7af 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA model.""" from typing import Iterable, List, Optional, Set, Tuple diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 79393421f3ae90022d2e4ad0083f920315cd1895..353177f784b2ed6eef37c2f85566f571295b5f7b 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 9f86b7a46981e94fd75a96b8a753f81c4b96f2ff..25fffd405f8dca5cc127890fd084ce8efe88864f 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Iterable, List, Optional, Set, Tuple, Any, Dict diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 6254d26c7060db4f1a611752fdfa415140c84fa1..29473f5bbaa0adff1b85a29256360930fcaf801b 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ 
-1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 5e1e6c6fa614166a6253910516df6820f0192d15..878f0c895c34b216741209d72d801dfef56678a3 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2024 The ModelBest team. diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index eb4282d62005a95015b1207246dc911d402957a1..ab697fb8cc64568f906e95d319c93f5a62c6b9f0 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -21,7 +23,6 @@ # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" from functools import partial -from itertools import accumulate from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -136,11 +137,15 @@ class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None, "audio": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return { "image": self.get_max_image_tokens(), "audio": self.get_max_audio_tokens(), - "video": self.get_max_video_tokens(seq_len) + "video": self.get_max_video_tokens(seq_len), } def get_default_audio_pool_step(self) -> int: @@ -367,23 +372,18 @@ class MiniCPMOMultiModalProcessor( hf_inputs, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: + audio_num_slices = hf_inputs.get("audio_num_slices", torch.empty(0)) - def get_slices(num_slices: List[int]) -> List[int]: - slice_indices = [0] + list(accumulate(num_slices)) - slices = [(slice_indices[i], slice_indices[i + 1]) - for i in range(len(num_slices))] - return [slice(*slice_item) for slice_item in slices] - - audio_slices = get_slices( - hf_inputs.get("audio_num_slices", torch.empty(0))) return dict( **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs), - audio_features=MultiModalFieldConfig.flat("audio", audio_slices), - audio_feature_lens=MultiModalFieldConfig.flat( - "audio", audio_slices), + audio_features=MultiModalFieldConfig.flat_from_sizes( + "audio", audio_num_slices), + audio_feature_lens=MultiModalFieldConfig.flat_from_sizes( + "audio", audio_num_slices), audio_num_slices=MultiModalFieldConfig.batched("audio"), audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"), - audio_embeds=MultiModalFieldConfig.flat("audio", 
audio_slices)) + audio_embeds=MultiModalFieldConfig.flat_from_sizes( + "audio", audio_num_slices)) class MultiModalProjector(nn.Module): diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index bf967d33a3176b43355aac27f7f9fc574d377490..58a4448d436aa5e5c37963af738bb13f3c41d9fc 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -24,7 +26,6 @@ import math import re from collections import Counter from functools import cached_property, partial -from itertools import accumulate from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -341,6 +342,15 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): **kwargs: object, ): hf_processor = self.ctx.get_hf_processor() + + # NumPy arrays are considered as Iterable but not Sequence in + # https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428 + image_processor = hf_processor.image_processor # type: ignore + for attr in ("mean", "std"): + val = getattr(image_processor, attr) + if isinstance(val, np.ndarray): + setattr(image_processor, attr, val.tolist()) + return hf_processor def get_image_processor(self): @@ -363,7 +373,11 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): else: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: mm_max_tokens = {"image": self.get_max_image_tokens()} if self.get_model_version() == (2, 6): mm_max_tokens["video"] = self.get_max_video_tokens(seq_len) @@ -759,30 +773,25 @@ class MiniCPMVMultiModalProcessor( hf_inputs, 
hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - - def get_slices(num_slices: List[int]) -> List[int]: - slice_indices = [0] + list(accumulate(num_slices)) - slices = [(slice_indices[i], slice_indices[i + 1]) - for i in range(len(num_slices))] - return [slice(*slice_item) for slice_item in slices] - - image_slices = get_slices( - hf_inputs.get("image_num_slices", torch.empty(0))) - video_slices = get_slices( - hf_inputs.get("video_num_slices", torch.empty(0))) - - return dict( - pixel_values=MultiModalFieldConfig.flat("image", image_slices), - image_sizes=MultiModalFieldConfig.batched("image"), - tgt_sizes=MultiModalFieldConfig.flat("image", image_slices), - image_num_slices=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.flat("image", image_slices), - video_pixel_values=MultiModalFieldConfig.flat( - "video", video_slices), - video_image_sizes=MultiModalFieldConfig.batched("video"), - video_tgt_sizes=MultiModalFieldConfig.flat("video", video_slices), - video_embeds=MultiModalFieldConfig.flat("video", video_slices), - video_num_slices=MultiModalFieldConfig.batched("video")) + image_num_slices = hf_inputs.get("image_num_slices", torch.empty(0)) + video_num_slices = hf_inputs.get("video_num_slices", torch.empty(0)) + + return dict(pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_slices), + image_sizes=MultiModalFieldConfig.batched("image"), + tgt_sizes=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_slices), + image_num_slices=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.flat_from_sizes( + "image", image_num_slices), + video_pixel_values=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_slices), + video_image_sizes=MultiModalFieldConfig.batched("video"), + video_tgt_sizes=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_slices), + video_embeds=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_slices), 
+ video_num_slices=MultiModalFieldConfig.batched("video")) def apply( self, @@ -1473,6 +1482,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): """ # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. + # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -1489,8 +1499,15 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): version = str(config.version).split(".") version = tuple([int(x) for x in version]) # Dispatch class based on version - instance_class = _SUPPORT_VERSION.get(version) - if instance_class is None: + instance_cls = _SUPPORT_VERSION.get(version) + if instance_cls is None: raise ValueError( "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") - return instance_class(vllm_config=vllm_config, prefix=prefix) + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 096ee4831f868c086af3c510debd69aed06353cd..0c143c9d8357c8cad71f8132900d6604029e2d7a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 7a9b8cd88cfd021c54532ea3821f448da71eb55c..fdc438917542384278a701788f1728b5023734c7 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index f7f9d7a186d95a297f334447e59e72a1593a94c7..d1cb04cdb242f43d12448998a505ffdd8513dbec 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index cda1f22bc85c284279ad22951830062607e236f0..4009cee6eb68fcf9d982bdedcf8037e3054fdc61 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import math from typing import Iterable, List, Set, Tuple, Optional @@ -66,7 +68,7 @@ class MLPSpeculator(nn.Module): https://arxiv.org/pdf/2404.19124 Trained speculators of this type are available on HF hub at: - https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite + https://huggingface.co/ibm-ai-platform and https://huggingface.co/ibm-granite """ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index a9102a6073a2ff81ea83fb4f3ba790d950c3f644..23814e6322d2e5d0748b7b6ff6ae7f92d73f8ce4 100644 --- 
a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 5c7ae0deefcd8d1730d19bac6fd665af4b6c1b1e..b524a14977b16cae92d72e08d561ad541f794ebe 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math import re from array import array diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 1235816413a44d5b69d188d4a7fbef56436f32f5..676c960623edf3f3b3419c3b27048fa0ac8310d8 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math from typing import Iterable, List, Optional, Set, Tuple, Union diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 2340283b69665dd909a2e19dce593127598831d7..6f0b831ac272732c7ef8df2e0ead1e4d0e3fb602 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index df4fd0a3256e99d9467581ac00f9aeea95ab0d55..9c674ab4644639a6ce21a07a2b1a8377fd72fbfe 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -1,47 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py # -------------------------------------------------------- # NVLM-D # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from typing import Optional +from typing import Mapping, Optional +import torch import torch.nn as nn from transformers import PretrainedConfig -from vllm.inputs import INPUT_REGISTRY from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalKwargs +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + MultiModalDataItems) +from vllm.multimodal.processing import (PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import ProcessorInputs from .intern_vit import InternVisionModel -from .internvl import (InternVLChatModel, InternVLInputPipeline, - get_max_internvl_image_tokens) +from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor, + InternVLChatModel, InternVLDummyInputsBuilder, + InternVLMultiModalProcessor) + +IMG_PAD = "<|vision_pad|>" -IMG_START = '<|vision_start|>' -IMG_END = '<|vision_end|>' -IMG_CONTEXT = '<|vision_pad|>' +class NVLMProcessor(BaseInternVLProcessor): -class NVLMInputPipeline(InternVLInputPipeline): + @property + def image_token_id(self) -> int: + return self.tokenizer.get_vocab()[IMG_PAD] + + def get_image_repl_features( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + if num_patches is None: + raise 
NotImplementedError("Embedding inputs are not supported") + + tile_pos_identifiers = [f"" for i in range(1, num_patches)] + if self.use_thumbnail and num_patches != 1: + tile_pos_identifiers += [""] - def _create_image_prompt(self, feature_size: int, num_patches: int) -> str: - tile_pos_identifiers = ([f"" - for i in range(1, num_patches)] + - [""]) context_size = feature_size // num_patches + features = "".join(identifier + IMG_PAD * context_size + for identifier in tile_pos_identifiers) + + # We include the start and end as well because "<", "tile"], resulting in assertion error + # when trying to find "" + features + "" - return '' + ''.join( - tile_pos_identifier + self.img_context_token * context_size - for tile_pos_identifier in tile_pos_identifiers) + '' + def get_image_repl_full( + self, + feature_size: int, + num_patches: Optional[int], + ) -> str: + return self.get_image_repl_features(feature_size, num_patches) -input_pipeline = NVLMInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) +class NVLMProcessingInfo(BaseInternVLProcessingInfo): + + def get_hf_processor( + self, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> NVLMProcessor: + return NVLMProcessor( + self.get_hf_config(), + self.get_tokenizer(), + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + def get_max_image_tokens(self) -> int: + hf_processor = self.get_hf_processor() + tokenizer = hf_processor.tokenizer + + max_num_patches = hf_processor.max_dynamic_patch + # we need +1 here because max_dynamic_patch in config doesn't + # include the thumbnail patch + tile_pos_identifiers = [ + f"" for i in range(max_num_patches) + ] + if hf_processor.use_thumbnail and max_num_patches != 1: + tile_pos_identifiers += [""] + + # "<", "tile"] + # so we include in the start_str + start_str = "" + tile_pos_identifiers.pop(0) + end_str = "" + start_token_len = len(tokenizer.encode(start_str)) + end_token_len = 
len(tokenizer.encode(end_str)) + tile_token_len = sum( + len(tokenizer.encode(identifier)) + for identifier in tile_pos_identifiers) + non_image_tokens_num = start_token_len + end_token_len + tile_token_len + return super().get_max_image_tokens() + non_image_tokens_num + + +class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + # The newline is necessary to separate ">" of the current item + # and "<" of the next item + prompt_text="\n" * num_images, + mm_data=mm_data, + ) + + +class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]): + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + if "image_num_patches" in out_mm_kwargs: + image_num_patches = out_mm_kwargs["image_num_patches"] + assert isinstance(image_num_patches, torch.Tensor) + image_num_patches = image_num_patches.tolist() + elif "image_embeds" in out_mm_kwargs: + # TODO: Use image size information in dictionary embedding inputs + # to compute num_patches (similar to Qwen2-VL) + image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) + else: + image_num_patches = [] + + def get_replacement_nvlm(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + feature_size = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + feature_size 
= self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + num_patches = image_num_patches[item_idx] + if num_patches is not None: + assert isinstance(num_patches, int) + + return PromptReplacementDetails( + full=hf_processor.get_image_repl_full(feature_size, + num_patches) + "\n", + features=hf_processor.get_image_repl_features( + feature_size, num_patches) + "\n", + ) + + # See note in dummy data regarding why we have the extra newline + return [ + PromptReplacement( + modality="image", + target="\n", + replacement=get_replacement_nvlm, + ) + ] -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) -@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) -@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) +@MULTIMODAL_REGISTRY.register_processor(NVLMMultiModalProcessor, + info=NVLMProcessingInfo, + dummy_inputs=NVLMDummyInputsBuilder) class NVLM_D_Model(InternVLChatModel): def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 538e31ec91699ec3a580efa7fdbb465918f292c0..3b470dfdd05be655f9058dcda830780c07d4a581 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py # Copyright 2024 The vLLM team. 
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index a35c911f90d96b13d98d4f3846e1c33aa43e6734..4b0455098eedb4d06223b5dadf0cd3a1632f64a2 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py # Copyright 2024 The vLLM team. diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index fbe5d1aee04b3cd35fff6894b5ddbbc341fdc0c0..d6e24c6d67f37ddc361cffa8d1415d35a926598e 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index ea1185aa80dc6f3bea135c70a9acd23badd1f26f..ad1d66902435bbf13eabdc4131235ae79a6436be 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index a3757b5c8808e86763f58e15869ad8fd94a1b07d..f4f5cdff64372fada27bdce7cd36662f6ebc4cdf 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py # Copyright (c) OrionStar Inc. 
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 5a28b1ffbb7b41aa48b745ed12ccaec59f8782d9..65d810dc23bc675160efe911d99ae0f1d23e7fce 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 14dd4b5b1b4dafba693bbd11fa972a59e5667df5..6a80bea348ea0c1fb112c197507e0517d2389cde 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py # Copyright 2023 The vLLM team. # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 59b7508a370f8b0d7ca34376ceb999f07c1a4194..6b05bfee949222bacd90c5eef9e6919880163cde 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 34141511ea791389aec31d5a7bd0323773582528..8f84e0726951d063f96b621d778fb9afeb25fce5 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from llama.py """Inference-only Phi3 model code inherit from Llama.py""" diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index f47676b934e4e6d5598602e3cdad362a35e9d40e..873e9d37771da0d74c9612341fb477867f4a9a02 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from typing import Iterable, List, Optional, Set, Tuple, Union @@ -474,6 +476,8 @@ class Phi3SmallForCausalLM(nn.Module, SupportsPP): continue if is_pp_missing_parameter(name, self): continue + if "lm_head.weight" in name and self.config.tie_word_embeddings: + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 0fcda81da2800ab4d6010d9c98172d069d337245..053390c521fc2e9dbb267b397ee7a217512838c1 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The vLLM team. # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. 
# @@ -320,7 +322,11 @@ class Phi3VProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: target_width, target_height = self.get_image_size_with_most_features() max_image_tokens = self.get_num_image_tokens( diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 6367b770a0affcf54bcea10991fa12177f49a2da..aa4bb52c444f765004644c70cdd2819457b57c93 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 37b9989e489ecdef64d027b3c52405d9dfc373f0..003e9c84c1c0ae3d1e36f8cba962e9f37a719c87 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import math from dataclasses import dataclass, fields from functools import cached_property diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index a9b32b8ff74467454ed17dffae3a32f1979bf180..b6517d9888bccd1c6cca7200c0aaaa356ae6eaf3 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py # Copyright (c) Alibaba Cloud. 
@@ -789,7 +791,11 @@ class QWenVLProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: @@ -809,13 +815,13 @@ class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]): vision_config = hf_config.visual - max_image_size = vision_config["image_size"] + target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) mm_data = { "image": - self._get_dummy_images(width=max_image_size, - height=max_image_size, + self._get_dummy_images(width=target_width, + height=target_height, num_images=num_images) } @@ -1273,6 +1279,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): """ # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. 
+ # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -1284,9 +1291,18 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): prefix: str = "", ) -> QWenBaseModel: config = vllm_config.model_config.hf_config + # Initialize VL - if hasattr(config, "visual"): - return QWenVL(vllm_config=vllm_config, prefix=prefix) + if hasattr(config, "visual"): # noqa: SIM108 + instance_cls = QWenVL # Initialize LLM else: - return QWenLLM(vllm_config=vllm_config, prefix=prefix) + instance_cls = QWenLLM + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 59491ad008243e684cdaf96a5943e57ded59a0f2..f2a5ea6b8148f0c571a3c1a9e6289baefe2bd92c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py # Copyright 2024 The Qwen team. 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py new file mode 100644 index 0000000000000000000000000000000000000000..e93cf46b900b6facc88cb0f19ae928a17955efec --- /dev/null +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -0,0 +1,1133 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" +from functools import cached_property, partial +from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from transformers import BatchFeature +from transformers.models.qwen2_5_vl import (Qwen2_5_VLImageProcessor, + Qwen2_5_VLProcessor) +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( + Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalFieldConfig +from vllm.platforms import _Backend +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import uses_mrope + +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder +from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, + apply_rotary_pos_emb_vision) +from .utils import 
(AutoWeightsLoader, WeightsMapper, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) +from .vision import get_vit_attn_backend + +logger = init_logger(__name__) + +# === Vision Inputs === # + + +class Qwen2_5_VLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """Shape: + `(num_patches, num_channels * patch_size * patch_size)` + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +class Qwen2_5_VLImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. + - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs, + Qwen2_5_VLImageEmbeddingInputs] + + +class Qwen2_5_VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values_videos: torch.Tensor + """Shape: + `(num_patches, + num_channels * temporal_patch_size * patch_size * patch_size)` + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + second_per_grid_ts: torch.Tensor + """ + The video time interval (in seconds) for each grid along the temporal + dimension in the 3D position IDs. Returned when `videos` is not `None`. 
+ """ + + +class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + video_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all videos' features. + Each tensor holds an video's features. + - `torch.Tensor`: A tensor holding all videos' features + (concatenation of all videos' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the videos. + - `hidden_size` must match the hidden size of language model backbone. + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs, + Qwen2_5_VLVideoEmbeddingInputs] + +# === Vision Encoder === # + + +class Qwen2_5_VisionMLP(nn.Module): + + def __init__(self, + in_features: int, + hidden_features: int, + bias: bool = False, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.gate_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_proj") + self.up_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") + self.down_proj = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_fn = act_fn + + def forward(self, x: torch.Tensor): + x_gate, _ = self.gate_proj(x) + x_gate = self.act_fn(x_gate) + x_up, _ = self.up_proj(x) + x_down, _ = self.down_proj(x_gate * x_up) + return x_down + + +class Qwen2_5_VisionAttention(nn.Module): + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + 
super().__init__() + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, world_size) + + self.qkv = ColumnParallelLinear(input_size=embed_dim, + output_size=3 * projection_size, + quant_config=quant_config, + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj") + + # Detect attention implementation. + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS + }: + raise RuntimeError( + f"Qwen2.5-VL does not support {self.attn_backend} backend now." + ) + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + x = x.view(*new_x_shape) + + # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] + q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + + if self.attn_backend == _Backend.FLASH_ATTN: + # from vllm_flash_attn.flash_attn_interface import ( + # flash_attn_varlen_func) + from flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0, + causal=False) + + context_layer = rearrange(output, + "(b s) ... -> b s ...", + b=batch_size) + elif self.attn_backend == _Backend.TORCH_SDPA: + seq_length = q.size(1) + q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) + attention_mask = torch.zeros([1, seq_length, seq_length], + device=q.device, + dtype=torch.bool) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i], + cu_seqlens[i - 1]:cu_seqlens[i]] = True + output = F.scaled_dot_product_attention(q, + k, + v, + attention_mask, + dropout_p=0.0) + context_layer = rearrange(output, "b h s d -> b s h d ") + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None) + + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() + + output, _ = self.proj(context_layer) + return output + + +class Qwen2RMSNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, 
eps={self.variance_epsilon}" + + +class Qwen2_5_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + self.attn = Qwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + self.mlp = Qwen2_5_VisionMLP(dim, + mlp_hidden_dim, + act_fn=act_fn, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb) + x = x + self.mlp(self.norm2(x)) + return x + + +class Qwen2_5_VisionPatchEmbed(nn.Module): + + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_channels: int = 3, + hidden_size: int = 1152, + ) -> None: + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.hidden_size = hidden_size + + kernel_size = (temporal_patch_size, patch_size, patch_size) + self.proj = nn.Conv3d(in_channels, + hidden_size, + kernel_size=kernel_size, + stride=kernel_size, + bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, + self.patch_size) + x = self.proj(x).view(L, self.hidden_size) + return x + + +class Qwen2_5_VisionPatchMerger(nn.Module): + + def __init__( + self, + d_model: int, + context_dim: int, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + spatial_merge_size: int = 2, + quant_config: 
Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.ln_q = norm_layer(context_dim) + self.mlp = nn.ModuleList([ + ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.0"), + nn.GELU(), + RowParallelLinear(self.hidden_size, + d_model, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.2"), + ]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.ln_q(x) + x = x.view(-1, self.hidden_size) + + mlp_fc1, mlp_act, mlp_fc2 = self.mlp + x_parallel, _ = mlp_fc1(x) + x_parallel = mlp_act(x_parallel) + out, _ = mlp_fc2(x_parallel) + return out + + +class Qwen2_5_VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + inv_freq = 1.0 / (theta + **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._freqs_cached = None + + def update_freqs_cache(self, seqlen: int) -> None: + if seqlen > self._seq_len_cached: + seqlen *= 2 + self._seq_len_cached = seqlen + self.inv_freq = 1.0 / (self.theta**(torch.arange( + 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) + / self.dim)) + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + self._freqs_cached = freqs + + def forward(self, seqlen: int) -> torch.Tensor: + self.update_freqs_cache(seqlen) + return self._freqs_cached[:seqlen] + + +class Qwen2_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config: Qwen2_5_VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + 
+ patch_size = vision_config.patch_size + temporal_patch_size = vision_config.temporal_patch_size + in_channels = vision_config.in_channels + depth = vision_config.depth + self.hidden_size = vision_config.hidden_size + self.num_heads = vision_config.num_heads + + # args for get_window_index + self.window_size = vision_config.window_size + self.patch_size = vision_config.patch_size + self.spatial_merge_size = vision_config.spatial_merge_size + self.fullatt_block_indexes = vision_config.fullatt_block_indexes + self.spatial_merge_unit = self.spatial_merge_size**2 + + self.patch_embed = Qwen2_5_VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=in_channels, + hidden_size=self.hidden_size, + ) + + # NOTE: We use torch native RMSNorm here for precision purposes. + norm_layer = partial(Qwen2RMSNorm, eps=norm_eps) + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + Qwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(depth) + ]) + self.merger = Qwen2_5_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + ) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids 
= hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def get_window_index(self, grid_thw): + window_index: list = [] + cu_window_seqlens: list = [0] + window_index_id = 0 + vit_merger_window_size = (self.window_size // + self.spatial_merge_size // self.patch_size) + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) + index_padded = index_padded.reshape(grid_t, num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, num_windows_h * num_windows_w, vit_merger_window_size, + vit_merger_window_size) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + cu_seqlens_tmp = seqlens.cumsum( + 0) * 
self.spatial_merge_unit + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # patchify + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) + + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + # windows attention + window_index, cu_window_seqlens = self.get_window_index(grid_thw) + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + rotary_pos_emb = rotary_pos_emb[window_index, :, :] + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + # compute cu_seqlens + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) + + # transformers + hidden_states = hidden_states.unsqueeze(1) + for layer_num, blk in enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens_now, + rotary_pos_emb=rotary_pos_emb) + + # adapter + hidden_states = self.merger(hidden_states) + reverse_indices = torch.argsort(window_index) + hidden_states = 
hidden_states[reverse_indices, :] + return hidden_states + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name.endswith("qkv.weight"): + visual_num_heads = self.num_heads + visual_embed_dim = self.hidden_size + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size, + visual_embed_dim) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) + elif name.endswith("qkv.bias"): + visual_num_heads = self.num_heads + visual_embed_dim = self.hidden_size + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1) + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen2_5_VLConfig) + + def get_hf_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + fps: Optional[float] = 2.0, + ) -> Qwen2_5_VLProcessor: + hf_processor = self.ctx.get_hf_processor(Qwen2_5_VLProcessor) + 
image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2_5_VLImageProcessor) + + if min_pixels: + image_processor.min_pixels = min_pixels + if max_pixels: + image_processor.max_pixels = max_pixels + if max_pixels or min_pixels: + image_processor.size = { + "min_pixels": image_processor.min_pixels, + "max_pixels": image_processor.max_pixels, + } + + return hf_processor + + def get_image_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + fps: Optional[float] = 2.0, + ) -> Qwen2_5_VLImageProcessor: + hf_processor = self.get_hf_processor( + min_pixels=min_pixels, + max_pixels=max_pixels, + fps=fps, + ) + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2_5_VLImageProcessor) + return image_processor + + +class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs), + second_per_grid_ts=MultiModalFieldConfig.batched("video"), + ) + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen2_5_VLMultiModalProcessor, + info=Qwen2_5_VLProcessingInfo, + dummy_inputs=Qwen2_5_VLDummyInputsBuilder) +class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ] + } + + # LoRA specific attributes, TODO: double check + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + "gate_proj", + "up_proj", + # vision tower + "qkv", + "attn.proj", # Distinguish patch_embed.proj + "fc1", + "fc2", + # projector + "mlp.0", + "mlp.2" + ] + embedding_modules = {} + embedding_padding_modules = [] + + # To ensure correct weight loading and mapping. 
+ hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.visual = Qwen2_5_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Qwen2ForCausalLM"], + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + @cached_property + def sampler(self): + if hasattr(self.language_model, "sampler"): + return self.language_model.sampler + + return get_sampler() + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid vision encoder sections for some models. + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. 
" + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return Qwen2_5_VLImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + if image_embeds is not None: + image_embeds = self._validate_and_reshape_mm_tensor( + image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + return Qwen2_5_VLImageEmbeddingInputs( + type="image_embeds", + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + second_per_grid_ts = kwargs.pop("second_per_grid_ts", None) + + if pixel_values_videos is None and video_embeds is None: + return None + + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2_5_VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. " + f"Got type: {type(video_embeds)}") + return Qwen2_5_VLVideoEmbeddingInputs( + type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) + + def _process_image_input( + self, + image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. 
+ merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, + video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + return modalities + + def get_multimodal_embeddings( + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. 
def get_input_embeddings_v0(
    self,
    input_ids: torch.Tensor,
    image_input: Optional[tuple[torch.Tensor, ...]] = None,
    video_input: Optional[tuple[torch.Tensor, ...]] = None,
) -> torch.Tensor:
    """Build input embeddings for the v0 code path.

    Text embeddings are obtained from ``self.get_input_embeddings`` and then
    image embeddings, followed by video embeddings, are merged in through
    ``merge_multimodal_embeddings`` using the image / video placeholder
    token ids from ``self.config``.

    Args:
        input_ids: Token ids, forwarded to ``get_input_embeddings`` and to
            ``merge_multimodal_embeddings``.
        image_input: Passed unchanged to ``self._process_image_input``;
            skipped when ``None``.
            NOTE(review): annotated as a tuple of tensors, but
            ``_process_image_input`` indexes its argument by string key --
            confirm the intended type.
        video_input: Passed unchanged to ``self._process_video_input``;
            skipped when ``None``. Same annotation caveat as above.

    Returns:
        The (possibly merged) input embeddings.
    """
    embeddings = self.get_input_embeddings(input_ids)

    # Images are merged before videos; each step feeds the next.
    if image_input is not None:
        embeddings = merge_multimodal_embeddings(
            input_ids,
            embeddings,
            self._process_image_input(image_input),
            placeholder_token_id=self.config.image_token_id,
        )

    if video_input is not None:
        embeddings = merge_multimodal_embeddings(
            input_ids,
            embeddings,
            self._process_video_input(video_input),
            placeholder_token_id=self.config.video_token_id,
        )

    return embeddings
+ + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen2.5-VL + open-source models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,)`. + pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. + `None` if no images are passed. + pixel_values_videos: Pixel values of videos to be fed to a model. + `None` if no videos are passed. + video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. + `None` if no videos are passed. + second_per_grid_ts: Tensor `(num_videos)` of video time interval ( + in seconds) for each grid along the temporal dimension in the + 3D position IDs. `None` if no videos are passed. + """ + + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. 
def get_mm_mapping(self) -> MultiModelKeys:
    """
    Get the module prefix in multimodal models

    Returns:
        The ``MultiModelKeys`` built by ``from_string_field`` from the three
        prefixes below.
    """
    # NOTE(review): "visual.merger." is nested under "visual.", so the
    # connector prefix also matches every tower module. Confirm that
    # `connector` and `tower_model` are not swapped here.
    # NOTE(review): `language_model` has no trailing dot while the other two
    # prefixes do -- confirm this is intentional.
    prefixes = {
        "language_model": "language_model",
        "connector": "visual.",
        "tower_model": "visual.merger.",
    }
    return MultiModelKeys.from_string_field(**prefixes)
Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. @@ -108,7 +110,11 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: hf_config = self.get_hf_config() max_source_positions = hf_config.audio_config.max_source_positions max_output_lengths = (max_source_positions - 2) // 2 + 1 diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 34b2bd503711cd0f9ec2764f3067ab99370322aa..0990148e2bdcfdad0ca18f8824dbbae0387c0b3e 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py # Copyright 2024 The Qwen team. diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 593ce4857af0fd58471639927a2b931f6b5e572e..00e4159e28cf7b9d5995d215a647e249e9a62753 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 The Qwen team. 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 26ca1629b6d0a4e9da39b17fc0a91fa40b3e21af..c5141fe5a45251dc5b2bd8bcb34fb86b532139a6 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py # Copyright 2024 The Qwen team. @@ -707,8 +709,8 @@ class Qwen2VisionTransformer(nn.Module): return loaded_params -class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], - dict[str, torch.Tensor]]): +class Qwen2VLEmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], + dict[str, torch.Tensor]]): def __init__(self, data: dict, modality: str) -> None: super().__init__(data, modality) @@ -740,26 +742,26 @@ class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], return self.data -class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems): +class Qwen2VLImageEmbeddingItems(Qwen2VLEmbeddingItems): def __init__(self, data: dict) -> None: super().__init__(data, "image") -class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems): +class Qwen2VLVideoEmbeddingItems(Qwen2VLEmbeddingItems): def __init__(self, data: dict) -> None: super().__init__(data, "video") -class Qwen2MultiModalDataParser(MultiModalDataParser): +class Qwen2VLMultiModalDataParser(MultiModalDataParser): def _parse_image_data( self, data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], ) -> ModalityDataItems[Any, Any]: if isinstance(data, dict): - return Qwen2EmbeddingItems(data, modality="image") + return Qwen2VLEmbeddingItems(data, modality="image") return super()._parse_image_data(data) @@ -768,7 +770,7 @@ class Qwen2MultiModalDataParser(MultiModalDataParser): data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], ) -> ModalityDataItems[Any, Any]: if isinstance(data, dict): - return 
Qwen2EmbeddingItems(data, modality="video") + return Qwen2VLEmbeddingItems(data, modality="video") return super()._parse_video_data(data) @@ -815,7 +817,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: return { "image": self.get_max_image_tokens(), "video": self.get_max_video_tokens(seq_len), @@ -1001,7 +1007,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] ): def _get_data_parser(self) -> MultiModalDataParser: - return Qwen2MultiModalDataParser() + return Qwen2VLMultiModalDataParser() def _get_prompt_replacements( self, @@ -1046,26 +1052,21 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) - image_slice_idxs = [0] + image_grid_thw.prod(-1).cumsum_(0).tolist() - image_slices = [ - slice(image_slice_idxs[i], image_slice_idxs[i + 1]) - for i in range(len(image_grid_thw)) - ] + image_grid_sizes = image_grid_thw.prod(-1) video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) - video_slice_idxs = [0] + video_grid_thw.prod(-1).cumsum_(0).tolist() - video_slices = [ - slice(video_slice_idxs[i], video_slice_idxs[i + 1]) - for i in range(len(video_grid_thw)) - ] + video_grid_sizes = video_grid_thw.prod(-1) return dict( - pixel_values=MultiModalFieldConfig.flat("image", image_slices), - image_embeds=MultiModalFieldConfig.flat("image", image_slices), + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), + image_embeds=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), 
image_grid_thw=MultiModalFieldConfig.batched("image"), - pixel_values_videos=MultiModalFieldConfig.flat( - "video", video_slices), - video_embeds=MultiModalFieldConfig.flat("video", video_slices), + pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), + video_embeds=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), video_grid_thw=MultiModalFieldConfig.batched("video"), ) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index de05bf2b772f5204a85360397669f7ba2c8e8f2d..3b2a7069efc9108b0c4eba9b938f2f85a26ab637 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """ Whenever you add an architecture to this page, please also update `tests/models/registry.py` with example HuggingFace models for it. @@ -44,7 +45,7 @@ _TEXT_GENERATION_MODELS = { "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), - "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), + "DeepseekV3ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), @@ -171,6 +172,7 @@ _MULTIMODAL_MODELS = { "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), # 
[Encoder-decoder] @@ -183,6 +185,10 @@ _SPECULATIVE_DECODING_MODELS = { "MedusaModel": ("medusa", "Medusa"), "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } + +_FALLBACK_MODEL = { + "TransformersModel": ("transformers", "TransformersModel"), +} # yapf: enable _VLLM_MODELS = { @@ -191,6 +197,7 @@ _VLLM_MODELS = { **_CROSS_ENCODER_MODELS, **_MULTIMODAL_MODELS, **_SPECULATIVE_DECODING_MODELS, + **_FALLBACK_MODEL, } @@ -377,7 +384,12 @@ class _ModelRegistry: if not architectures: logger.warning("No model architectures are specified") - return architectures + normalized_arch = [] + for model in architectures: + if model not in self.models: + model = "TransformersModel" + normalized_arch.append(model) + return normalized_arch def inspect_model_cls( self, diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 5997a76890c9d54380a5d87ceb8df28ebd0524a9..742e63a065b18bafa773d36c737e423a3870a9ca 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import itertools from typing import Iterable, List, Optional, Tuple diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 30a5ba6523007c24be7093d04e5ce299419944a4..870c0294d1c04fe0fc37850ed2fbfc5ce9fdb590 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Implementation of SiglipVisionModel intended to be only used within a vision language model.""" diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index e6d919f23c85da5a9d02c888a32165d0c17ee1ee..6215ed814bf42f20250f339fae512c29cf8899bf 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # 
https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index c9d1af78246a6896f899d6713b9c4a1ca9a4b843..a5d4432669f4c66f8553278b5b0a7b4c50a2a806 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. # All rights reserved. # diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 1cd0dedfed2cbbc69d5c2c73a31998e09c09fb28..01ea43666482acef600f320367f8ea5e3c068820 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 02ca7fe08e5568bdc478d1a3da682e705b088cd0..a38035e37ec73469e0bc240322fa99ef07086d88 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..43d2c88d3b9ca177b73211eb8cba715569d97c4c --- /dev/null +++ b/vllm/model_executor/models/transformers.py @@ -0,0 +1,266 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2024 The vLLM team. 
def vllm_flash_attention_forward(
        # Transformers args
        module: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attention_mask: torch.Tensor,
        # Transformers kwargs
        scaling: float = None,
        # vLLM kwargs
        attn_metadata: AttentionMetadata = None,
        attention_instances: list[Attention] = None,
        **kwargs):
    """Run attention for one layer through the matching vLLM ``Attention``.

    Args:
        module: Calling attention module; only ``module.layer_idx`` is read,
            to pick the entry of ``attention_instances``.
        query, key, value: Projected tensors. Each is transposed on dims
            (1, 2) and then flattened to two dimensions.
            Assumes a (batch, heads, tokens, head_dim) layout -- TODO confirm.
        attention_mask: Accepted for signature compatibility; not used.
        scaling: When not ``None``, overwrites ``impl.scale`` on the selected
            ``Attention`` instance (the instance is mutated in place).
        attn_metadata: Forwarded to ``Attention.forward``.
        attention_instances: Per-layer ``Attention`` objects. Despite the
            ``None`` default it is indexed unconditionally, so it must be
            supplied.
        **kwargs: Ignored.

    Returns:
        ``(attention_output, None)``.
    """
    layer_attn = attention_instances[module.layer_idx]
    if scaling is not None:
        layer_attn.impl.scale = float(scaling)

    # Second-to-last dim of the *untransposed* query; becomes the leading
    # dimension of every flattened tensor.
    num_rows = query.shape[-2]

    def _flatten(tensor: torch.Tensor) -> torch.Tensor:
        # Swap dims 1 and 2, then collapse everything else into one axis.
        return tensor.transpose(1, 2).reshape(num_rows, -1)

    attn_output = layer_attn.forward(
        _flatten(query),
        _flatten(key),
        _flatten(value),
        kv_cache=None,  # argument not used
        attn_metadata=attn_metadata)
    return attn_output, None
+ """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return super().forward(input)[0] + + return HFCompatibleLinear( + input_size=linear.in_features, + output_size=linear.out_features, + bias=linear.bias is not None, + ) + + +class TransformersModel(nn.Module): + embedding_padding_modules = ["lm_head"] + embedding_modules = ["embed_tokens" + ] # TODO transformers will have a util to get it + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + logger.info("Using Transformers backend.") + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.config = config + self.quant_config = quant_config + self.vocab_size = config.vocab_size + self.unpadded_vocab_size = config.vocab_size + + self.model: PreTrainedModel = AutoModel.from_config( + self.config, + attn_implementation="vllm", + trust_remote_code=vllm_config.model_config.trust_remote_code, + ) + prefix = self.model.base_model_prefix + + # MLP modifications + self.apply_base_model_tp_plan(self.model) + + # Attention modifications (assumes 1 attention op per hidden layer) + tp_size = get_tensor_model_parallel_world_size() + self.attention_instances = [ + Attention( + num_heads=divide(config.num_attention_heads, tp_size), + head_size=config.head_dim, + # NOTE: We use Llama scale as default, if it's set by + # Transformers, it's updated in vllm_flash_attention_forward + scale=config.head_dim**-0.5, + num_kv_heads=divide(config.num_key_value_heads, tp_size), + cache_config=cache_config, + quant_config=None, + prefix=f"{i}.attn") for i in range(config.num_hidden_layers) + ] + + # Model modifications + self.replace_vocab_embed_class(self.model) + + # ForCausalLM modifications + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=None, + prefix=maybe_prefix(prefix, "lm_head")) + if config.tie_word_embeddings: + self.lm_head.weight = 
def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""):
    """
    Apply the base model tensor parallelization plan to a module.
    Currently only supports linear layers.

    Walks ``module`` recursively. Every ``nn.Linear`` child whose qualified
    name matches a pattern of ``self.config.base_model_tp_plan`` is replaced
    in place (via ``setattr`` on its parent) by the module returned from
    ``replace_linear_class``.

    Args:
        module: Root of the sub-tree to process; modified in place.
        prefix: Qualified name of ``module``, used to build the names that
            are matched against the plan patterns.

    Raises:
        ValueError: If the config has no tensor-parallel plan but the tensor
            parallel world size is greater than 1.
    """
    tp_plan = self.config.base_model_tp_plan
    if tp_plan is None:
        if get_tensor_model_parallel_world_size() > 1:
            raise ValueError(
                "Trying to run tensor parallelization but the model does not "
                "support it yet!")
        # No plan and no tensor parallelism requested: there is nothing to
        # replace. Returning here avoids calling `.items()` on `None`.
        return

    for child_name, child_module in module.named_children():
        qual_name = maybe_prefix(prefix, child_name)
        for pattern, style in tp_plan.items():
            if re.match(pattern, qual_name) and isinstance(
                    child_module, nn.Linear):
                new_module = replace_linear_class(child_module, style,
                                                  self.quant_config)
                setattr(module, child_name, new_module)
                log_replacement(qual_name, child_module, new_module)
        # Descend exactly once per child, after all patterns were tried.
        self.apply_base_model_tp_plan(child_module, prefix=qual_name)
def load_weights(self, weights: Iterable[tuple[str,
                                               torch.Tensor]]) -> set[str]:
    """Copy checkpoint tensors into the parameters of this module.

    Each incoming name is looked up among ``self.named_parameters()``. If it
    is not found as-is, it is retried with ``self.model.base_model_prefix``
    and a dot prepended. Names that still do not resolve are skipped
    silently.

    Args:
        weights: Iterable of ``(name, tensor)`` pairs.

    Returns:
        The set of (resolved) parameter names that were loaded.
    """
    known_params = dict(self.named_parameters())
    loaded: set[str] = set()

    for raw_name, tensor in weights:
        # Fall back to the prefixed name only when the raw one is unknown.
        if raw_name in known_params:
            target = raw_name
        else:
            target = f"{self.model.base_model_prefix}.{raw_name}"

        if target not in known_params:
            continue

        param = known_params[target]
        # Parameters may carry their own loader; otherwise use the default.
        load_fn = getattr(param, "weight_loader", default_weight_loader)
        load_fn(param, tensor)
        loaded.add(target)

    return loaded
vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, @@ -31,7 +34,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings, @@ -90,7 +93,11 @@ class UltravoxProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: feature_extractor = self.get_feature_extractor() max_audio_tokens = math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) @@ -337,7 +344,20 @@ class ModifiedWhisperEncoder(WhisperEncoder): UltravoxMultiModalProcessor, info=UltravoxProcessingInfo, dummy_inputs=UltravoxDummyInputsBuilder) -class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): +class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + # LoRA specific attributes + # TODO : Add LoRA to the audio tower and projector. 
def get_mm_mapping(self) -> MultiModelKeys:
    """
    Get the module prefix in multimodal models

    Returns:
        The ``MultiModelKeys`` built by ``from_string_field`` from the
        language-model, connector and tower prefixes below.
    """
    prefixes = {
        "language_model": "language_model.",
        "connector": "multi_modal_projector.",
        "tower_model": "audio_tower.",
    }
    return MultiModelKeys.from_string_field(**prefixes)
typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -636,6 +638,19 @@ def input_mapper_for_whisper( @MULTIMODAL_REGISTRY.register_max_multimodal_tokens( "audio", get_max_whisper_audio_tokens) class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): + packed_modules_mapping = { + "self_attn.qkv_proj": [ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + ], + "encoder_attn.kv_proj": ["encoder_attn.k_proj", "encoder_attn.v_proj"], + } + + hf_to_vllm_mapper = WeightsMapper(orig_to_new_substr={ + ".fc1.": ".mlp.fc1.", + ".fc2.": ".mlp.fc2." + }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -729,10 +744,10 @@ class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) - mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}) + # add fake zeros bias for k_proj to state_dict weights = _create_fake_bias_for_k_proj(weights) - return loader.load_weights(weights, mapper=mapper) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def _create_fake_bias_for_k_proj( diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index a9ce8af15d3bb5ab5b549c322c906af84a68109f..2b1294bf7baa3f72010fdf552173ef198a7736a0 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from fractions import Fraction from typing import Callable, Optional, Union diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index b86cafce85d12ebfc095bdc25485220516ec4575..dea8b0e9d471dda820755bb171db5500126fbf81 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses 
import dataclass from typing import Any, Dict, List, Tuple diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 61e8881b64f5d665780818d1abcfae77a3ef9ea5..0a580a4e907deb3ab1a014eabdeaeb6cf8ce98d5 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from dataclasses import dataclass from typing import Dict, List, Optional, Tuple diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 1c799a0685a767e940fee2b1916fc9e9eef5533a..c450b373f5db59a4f1b81d15ec9b113ee025d08a 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Utils for model executor.""" from typing import Any, Dict, Optional diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index ae46cac22748109cbca96f6d14db5f1495cc5db4..ca48dae3756bcd8faa01a771240e6475a9401358 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .base import MultiModalPlaceholderMap, MultiModalPlugin from .hasher import MultiModalHashDict, MultiModalHasher from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index de80f22bac2a32b82bad369b51be8d1ad4e0b067..f379ec1682a3c99eeecbda7a08b6f9097882c920 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 from io import BytesIO from pathlib import Path diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fd3ec7e0ec8cecd477ca52253dc5d3204e15b599..c48d07ba365ba62a56c99842726d17a3261cc15c 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import 
ABC, abstractmethod from collections import defaultdict from pathlib import Path diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 24aa1ca658048191fb32b1a2ac7ab0591864f367..7d277fd67deca1425603c857586a5bdd66867248 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle from typing import TYPE_CHECKING, Iterable, Mapping, Optional diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index da13a381c453096259a40fe982779718234fc68a..98ac8057e8f18342f28318c58d8b069f41ce4323 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 from functools import lru_cache from io import BytesIO diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index c8757bb3315abf255553e9b133f1fc3fa4cd866b..03fcf4afcf8d30240953123ffbd14704e60cb2f7 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -1,7 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass +from functools import partial +from itertools import accumulate from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, Union, cast, final) @@ -137,9 +141,9 @@ Uses a list instead of a tensor if the dimensions of each element do not match. 
def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: """Equality check between :data:`NestedTensors` objects.""" if isinstance(a, torch.Tensor): - return isinstance(b, torch.Tensor) and bool((a == b).all().item()) + return isinstance(b, torch.Tensor) and torch.equal(a, b) elif isinstance(b, torch.Tensor): - return isinstance(a, torch.Tensor) and bool((b == a).all().item()) + return isinstance(a, torch.Tensor) and torch.equal(b, a) if isinstance(a, list): return (isinstance(b, list) @@ -161,54 +165,120 @@ A dictionary containing nested tensors which have been batched via @dataclass(frozen=True) class MultiModalFieldElem: - """Contains metadata and data of an item in :class:`MultiModalKwargs`.""" - field: "BaseMultiModalField" + """ + Represents a keyword argument corresponding to a multi-modal item + in :class:`MultiModalKwargs`. + """ + + modality: str + """ + The modality of the corresponding multi-modal item. + Each multi-modal item can consist of multiple keyword arguments. + """ + + key: str + """ + The key of this field in :class:`MultiModalKwargs`, + i.e. the name of the keyword argument to be passed to the model. + """ + data: NestedTensors + """ + The tensor data of this field in :class:`MultiModalKwargs`, + i.e. the value of the keyword argument to be passed to the model. + """ + + field: "BaseMultiModalField" + """ + Defines how to combine the tensor data of this field with others + in order to batch multi-modal items together for model inference. 
+ """ def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - return (self.field == other.field - and nested_tensors_equal(self.data, other.data)) + return ((self.modality, self.key) == (other.modality, other.key) + and nested_tensors_equal(self.data, other.data) + and type(self.field) == type(other.field)) # noqa: E721 @dataclass(frozen=True) class BaseMultiModalField(ABC): - """Abstract base class for a field in :class:`MultiModalKwargs`.""" - key: str - modality: str + """ + Defines how to interpret tensor data belonging to a keyword argument in + :class:`MultiModalKwargs` for multiple multi-modal items, and vice versa. + """ + + def _field_factory(self, *, modality: str, key: str): + f = partial( + MultiModalFieldElem, + modality=modality, + key=key, + field=self, + ) + + # Allow passing data as positional argument + def factory(data: NestedTensors) -> MultiModalFieldElem: + return f(data=data) + + return factory @abstractmethod - def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + """ + Construct :class:`MultiModalFieldElem` instances to represent + the provided data. + + This is the inverse of :meth:`reduce_data`. 
+ """ raise NotImplementedError - def _build_elem(self, data: NestedTensors) -> MultiModalFieldElem: - return MultiModalFieldElem(self, data) + @abstractmethod + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + raise NotImplementedError - def reduce(self, batch: list[MultiModalFieldElem]) -> MultiModalFieldElem: - """Merge multiple instances of :class:`MultiModalFieldElem` together.""" - fields = [item.field for item in batch] - if len(set(fields)) > 1: - raise ValueError(f"Cannot merge different {fields=}") + def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: + """ + Merge the data from multiple instances of :class:`MultiModalFieldElem`. - data = self._reduce_data([item.data for item in batch]) + This is the inverse of :meth:`build_elems`. + """ + field_types = [type(item.field) for item in elems] + if len(set(field_types)) > 1: + raise ValueError(f"Cannot merge different {field_types=}") - return self._build_elem(data) + return self._reduce_data([item.data for item in elems]) @dataclass(frozen=True) class MultiModalBatchedField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an element in the batch - is obtained by indexing into the first dimension of the underlying data. 
+ See also: + :func:`MultiModalFieldConfig.batched` """ - def build_elems(self, batch: NestedTensors) -> list[MultiModalFieldElem]: - return [self._build_elem(item) for item in batch] + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(item) for item in data] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + if len(batch) == 1: + # An optimization when `batch` contains only one tensor: + # - produce exactly same result as `torch.stack(batch)` + # - will achieve zero-copy if the tensor is contiguous + return batch[0].unsqueeze(0).contiguous() first_shape = batch[0].shape if all(elem.shape == first_shape for elem in batch): return torch.stack(batch) @@ -219,19 +289,28 @@ class MultiModalBatchedField(BaseMultiModalField): @dataclass(frozen=True) class MultiModalFlatField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an element in the batch - is obtained by slicing along the first dimension of the underlying data. 
+ See also: + :func:`MultiModalFieldConfig.flat` + :func:`MultiModalFieldConfig.flat_from_sizes` """ + slices: Sequence[slice] def build_elems( self, - batch: NestedTensors, - slices: Sequence[slice], - ) -> list[MultiModalFieldElem]: - return [self._build_elem(batch[slice_]) for slice_ in slices] + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(data[s]) for s in self.slices] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + if len(batch) == 1: + # An optimization when `batch` contains only one tensor: + # - produce exactly same result as `torch.concat(batch)` + # - will achieve zero-copy if the tensor is contiguous + return batch[0].contiguous() first_shape = batch[0].shape if all(elem.shape[1:] == first_shape[1:] for elem in batch): return torch.concat(batch) @@ -239,42 +318,175 @@ class MultiModalFlatField(BaseMultiModalField): return [e for elem in batch for e in elem] +@dataclass(frozen=True) +class MultiModalSharedField(BaseMultiModalField): + """ + See also: + :func:`MultiModalFieldConfig.shared` + """ + batch_size: int + + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(data)] * self.batch_size + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + return batch[0] + + class MultiModalFieldConfig: @staticmethod def batched(modality: str): + """ + Defines a field where an element in the batch is obtained by + indexing into the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + + Example: + + .. 
code-block:: + + Input: + Data: [[AAAA] + [BBBB] + [CCCC]] + + Output: + Element 1: [AAAA] + Element 2: [BBBB] + Element 3: [CCCC] + """ return MultiModalFieldConfig( - field_cls=MultiModalBatchedField, + field=MultiModalBatchedField(), modality=modality, ) @staticmethod def flat(modality: str, slices: Sequence[slice]): + """ + Defines a field where an element in the batch is obtained by + slicing along the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + slices: For each multi-modal item, a slice that is used to extract + the data corresponding to it. + + Example: + + .. code-block:: + + Given: + slices: [slice(0, 3), slice(3, 7), slice(7, 9)] + + Input: + Data: [AAABBBBCC] + + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + """ return MultiModalFieldConfig( - field_cls=MultiModalFlatField, + field=MultiModalFlatField(slices=slices), modality=modality, - slices=slices, ) - def __init__( - self, - field_cls: type[BaseMultiModalField], - modality: str, - **field_config: Any, - ) -> None: + @staticmethod + def flat_from_sizes(modality: str, size_per_item: torch.Tensor): + """ + Defines a field where an element in the batch is obtained by + slicing along the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + slices: For each multi-modal item, the size of the slice that + is used to extract the data corresponding to it. + + Example: + + .. 
code-block:: + + Given: + size_per_item: [3, 4, 2] + + Input: + Data: [AAABBBBCC] + + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + + See also: + :func:`MultiModalFieldConfig.flat` + """ + + slice_idxs = [0, *accumulate(size_per_item)] + slices = [ + slice(slice_idxs[i], slice_idxs[i + 1]) + for i in range(len(size_per_item)) + ] + + return MultiModalFieldConfig.flat(modality, slices) + + @staticmethod + def shared(modality: str, batch_size: int): + """ + Defines a field where an element in the batch is obtained by + taking the entirety of the underlying data. + + This means that the data is the same for each element in the batch. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + batch_size: The number of multi-modal items which share this data. + + Example: + + .. code-block:: + + Given: + batch_size: 4 + + Input: + Data: [XYZ] + + Output: + Element 1: [XYZ] + Element 2: [XYZ] + Element 3: [XYZ] + Element 4: [XYZ] + """ + return MultiModalFieldConfig( + field=MultiModalSharedField(batch_size), + modality=modality, + ) + + def __init__(self, field: BaseMultiModalField, modality: str) -> None: super().__init__() - self.field_cls = field_cls + self.field = field self.modality = modality - self.field_config = field_config def build_elems( self, key: str, batch: NestedTensors, ) -> Sequence[MultiModalFieldElem]: - field = self.field_cls(key=key, modality=self.modality) - return field.build_elems(batch, **self.field_config) # type: ignore + return self.field.build_elems(self.modality, key, batch) class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): @@ -285,11 +497,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): @staticmethod def from_elems(elems: Sequence[MultiModalFieldElem]): - return MultiModalKwargsItem({elem.field.key: elem for elem in elems}) + return MultiModalKwargsItem({elem.key: elem for elem in elems}) @property def modality(self) -> str: - modalities = 
{elem.field.modality for elem in self.data.values()} + modalities = {elem.modality for elem in self.data.values()} assert len(modalities) == 1, f"Found different modalities={modalities}" return next(iter(modalities)) @@ -349,7 +561,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): elems_by_key[key].append(elem) data = { - key: elems[0].field.reduce(elems).data + key: elems[0].field.reduce_data(elems) for key, elems in elems_by_key.items() if len(elems) > 0 } @@ -394,6 +606,12 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): return stacked tensors_ = cast(list[torch.Tensor], stacked) + if len(tensors_) == 1: + # An optimization when `tensors_` contains only one tensor: + # - produce exactly same result as `torch.stack(tensors_)` + # - will achieve zero-copy if the tensor is contiguous + return tensors_[0].unsqueeze(0).contiguous() + if any(t.shape != tensors_[0].shape for t in tensors_): # The tensors have incompatible shapes and can't be stacked. return tensors_ diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index ccff0e857eec4a044c24e47ca8f2fdf247c5a86a..063f458b2c4d958b578e3632c18a9057cb52838f 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from collections import UserDict from collections.abc import Callable, Iterator, Mapping, Sequence diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 750646ac6e4315ea561c9e22514851b63429755a..d704fa59b96af1514d478b12de8e6adfa7c333b9 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import re from abc import ABC, abstractmethod from collections import defaultdict @@ -678,7 +680,11 @@ class BaseProcessingInfo: raise NotImplementedError @abstractmethod - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + def get_mm_max_tokens_per_item( + self, + 
seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: """ Get the maximum possible number of tokens per data item for each modality. diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index c68edaff80167fb772eaf34bdeae666870f8305f..5dd7548540448c17d3395c30aeb17d8c0f85fa59 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field @@ -149,7 +151,8 @@ class MultiModalProfiler(Generic[_I]): mm_counts = self.get_mm_limits() info = self.processing_info - mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) + mm_max_tokens_per_item = info.get_mm_max_tokens_per_item( + seq_len, mm_counts) if mm_counts.keys() != mm_max_tokens_per_item.keys(): raise AssertionError( diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 7a4b85385cac9be0b6605ff06fdb20014670a0b9..04141114288c9339b1d636e7741379edd6fe3b15 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools from collections import UserDict from dataclasses import dataclass @@ -262,7 +264,9 @@ class MultiModalRegistry: ) processor = self.create_processor(model_config, tokenizer) seq_len = model_config.max_model_len - return processor.info.get_mm_max_tokens_per_item(seq_len) + mm_limits = self.get_mm_limits_per_prompt(model_config) + return processor.info.get_mm_max_tokens_per_item( + seq_len, mm_limits) return { key: plugin.get_max_multimodal_tokens(model_config) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 6f27fa76c21930b700e355146cf80cf08ee79b76..ab450342ee76775f4a4ef177788ab983987583d1 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache from 
itertools import groupby from pathlib import Path diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 1ad1f5abc27a2b469de8d380aa0c77eb2cbc1811..78a2918e3ed3e79a35a7ee2c6406fe2b9a977383 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,10 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 + import base64 from functools import lru_cache, partial from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional -import cv2 import numpy as np import numpy.typing as npt from PIL import Image @@ -93,6 +94,8 @@ def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray: new_height, new_width = size resized_frames = np.empty((num_frames, new_height, new_width, channels), dtype=frames.dtype) + # lazy import cv2 to avoid bothering users who only use text models + import cv2 for i, frame in enumerate(frames): resized_frame = cv2.resize(frame, (new_width, new_height)) resized_frames[i] = resized_frame diff --git a/vllm/outputs.py b/vllm/outputs.py index 25b2265285d16db06584bfe47566e6e0919613d0..786380c37f6cb588f10b7be40871b9311171e3b9 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from dataclasses import dataclass from typing import Dict, Generic, List, MutableSequence, Optional diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 8ac5160e91f9ffa873b525fc004b9bae60ff91f7..e3751bd4cf9b9310c89ebd6c69d314ecad836c2f 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import traceback from itertools import chain @@ -32,14 +34,19 @@ def cuda_platform_plugin() -> Optional[str]: is_cuda = False try: - import pynvml + from vllm.utils import import_pynvml + pynvml = import_pynvml() pynvml.nvmlInit() try: if pynvml.nvmlDeviceGetCount() > 0: is_cuda = True finally: pynvml.nvmlShutdown() - except Exception: + 
except Exception as e: + if "nvml" not in e.__class__.__name__.lower(): + # If the error is not related to NVML, re-raise it. + raise e + # CUDA is supported on Jetson, but NVML may not be. import os diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 4483066191e59e9417bbc0c17b7518b7aa91ee51..b7cea7efc325a7366139dc165bb67a9df26cb95a 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import TYPE_CHECKING, Optional diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 91dcdff006e3e396d94808a8ea3e21938ebbbbcd..991d55ac861a470807dd24fcb87507373226f6fa 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Code inside this file can safely assume cuda platform, e.g. importing pynvml. However, it should not initialize cuda context. """ @@ -7,7 +8,6 @@ from functools import lru_cache, wraps from typing import (TYPE_CHECKING, Callable, List, Optional, Tuple, TypeVar, Union) -import pynvml import torch from typing_extensions import ParamSpec @@ -15,6 +15,7 @@ from typing_extensions import ParamSpec import vllm._C # noqa import vllm.envs as envs from vllm.logger import init_logger +from vllm.utils import import_pynvml from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -28,13 +29,7 @@ logger = init_logger(__name__) _P = ParamSpec("_P") _R = TypeVar("_R") -if pynvml.__file__.endswith("__init__.py"): - logger.warning( - "You are using a deprecated `pynvml` package. Please install" - " `nvidia-ml-py` instead, and make sure to uninstall `pynvml`." - " When both of them are installed, `pynvml` will take precedence" - " and cause errors. 
See https://pypi.org/project/pynvml " - "for more information.") +pynvml = import_pynvml() # pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models # see https://github.com/huggingface/diffusers/issues/9704 for details @@ -280,6 +275,14 @@ class NvmlCudaPlatform(CudaPlatformBase): physical_device_id = device_id_to_physical_device_id(device_id) return cls._get_physical_device_name(physical_device_id) + @classmethod + @lru_cache(maxsize=8) + @with_nvml_context + def get_device_uuid(cls, device_id: int = 0) -> str: + physical_device_id = device_id_to_physical_device_id(device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + return pynvml.nvmlDeviceGetUUID(handle) + @classmethod @lru_cache(maxsize=8) @with_nvml_context diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 0e1c4c0c5949f3bc4aff2323662aff6d6c6a5e77..78ddb67bb3fa31d9c7334bc61eea32a100765258 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import TYPE_CHECKING, Optional diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 0854126ed42d69d68ebb5c8ce63118385af489e0..3ce565d44127c019ed3da9283e53b012a28e7321 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import platform import random @@ -181,6 +183,11 @@ class Platform: """Get the name of a device.""" raise NotImplementedError + @classmethod + def get_device_uuid(cls, device_id: int = 0) -> str: + """Get the uuid of a device, e.g. 
the PCI bus ID.""" + raise NotImplementedError + @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: """Get the total memory of a device in bytes.""" diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 23a7126fb05cfbcf5b314aa001879aee2638bd55..5a03f5f7acbc1330453bacc46aab526eabbb5767 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Optional from vllm.logger import init_logger diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 3282c061714d35bb42f6d7bb0fe0dfaccc71e8fb..41221de0afe509556cf8a01b84dcbcb14c137e80 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Optional import torch diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 70969ef00f4d8a956c224d5e45e469e929cd6f32..ab16c380e927201002d66452a204ae45ea0c8b18 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from functools import lru_cache from typing import TYPE_CHECKING, Dict, List, Optional @@ -77,6 +79,9 @@ class RocmPlatform(Platform): def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla) -> str: + if use_mla: + logger.info("Using Triton MLA backend.") + return "vllm.attention.backends.triton_mla.TritonMLABackend" selected_backend = (_Backend.ROCM_FLASH if selected_backend == _Backend.FLASH_ATTN else selected_backend) if selected_backend == _Backend.ROCM_FLASH: diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index a6e72231b0ab675cf2997e2a5cfb7bc8d00261c8..0611ca66009008bf98344dd229853c34341f4f54 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Optional import 
torch diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index a5ca77f57cf47bcfec4d289a1a142f7a4a9f1e5d..81bc85f9415e83f8b593c61efa74be307ffe0379 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Optional import torch @@ -66,9 +68,14 @@ class XPUPlatform(Platform): # check and update model config model_config = vllm_config.model_config if model_config.dtype == torch.bfloat16: - logger.warning( - "bfloat16 is not fully supported on XPU, casting to float16.") - model_config.dtype = torch.float16 + bf16_supported = cls.device_support_bf16() + if not bf16_supported: + logger.warning( + "bfloat16 is only supported on Intel Data Center GPU, " + "Intel Arc GPU is not supported yet. Your device is %s," + "which is not supported. will fallback to float16", + cls.get_device_name()) + model_config.dtype = torch.float16 if not model_config.enforce_eager: logger.warning( "CUDA graph is not supported on XPU, fallback to the eager " @@ -116,3 +123,15 @@ class XPUPlatform(Platform): ) -> float: torch.xpu.reset_peak_memory_stats(device) return torch.xpu.max_memory_allocated(device) + + @classmethod + def device_support_bf16(cls) -> bool: + device_name = cls.get_device_name().lower() + if device_name.count("arc") > 0: + return False + elif device_name.count("data center gpu") > 0: + return True + else: + logger.warning("Unknown device name %s, always use float16", + device_name) + return False diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index a78a054917756fdc30ade7ccdf316dde5ba03232..389cb8728103189b501f8a936fdeb02b194a12a1 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import os from typing import Callable, Dict diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index b24b7e91a7ae7a1371d5f62b558222fca0574e52..061232eb11830bcef3acfc68281c467dcd4d478a 
100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Optional import msgspec diff --git a/vllm/profiler/__init__.py b/vllm/profiler/__init__.py index 3e25f5cc283f2992093910ab451191dec6ea173c..00af72b1d41fc9800f87f43bd4035aac695401ae 100644 --- a/vllm/profiler/__init__.py +++ b/vllm/profiler/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .layerwise_profile import layerwise_profile __all__ = [ diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 29c0edd0ee5352ad6067fda50f1bcfaa6fef8874..6351ef63da2bee8c3b07bcff7253a02d1251600f 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy from collections import defaultdict from dataclasses import asdict, dataclass, field diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py index 033035e434325a91896d9921ea90df5d82ba38c1..62b39f510703ea0f24b2cb908e40fa2d1b26accb 100644 --- a/vllm/profiler/utils.py +++ b/vllm/profiler/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Callable, Dict, List, Type, Union diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py index 27a61e692e1b743a1cd195cd552cd3aa9ab5b1a4..c2f9f16919b7fc7bbb89899b615dc06e415ae6bd 100644 --- a/vllm/prompt_adapter/layers.py +++ b/vllm/prompt_adapter/layers.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Optional diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py index 18a5f86c341a9419be2211e166664efbaf2b025d..3ba7d0896f95ac1c395f9ffecec89cb900dbce49 100644 --- a/vllm/prompt_adapter/models.py +++ b/vllm/prompt_adapter/models.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging import math from typing import Any, Callable, 
Dict, List, Optional, Type diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index 775dd11db07195221e445db92fefb941780d66fe..dfb8e61d786a0f81bf617f33a3a06957cdab02ac 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import msgspec from vllm.adapter_commons.request import AdapterRequest diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 8b2732923c4e7bc6a7da6a4442e229ec9ab7a52c..dd179ab938f8346f2d48881532fa4e7d5b59798a 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420 import os diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py index ddc1ef893c6f2c70c238279aea6d767148c90766..28dcc16871120e03646df779e7bddd2b824d64ff 100644 --- a/vllm/prompt_adapter/worker_manager.py +++ b/vllm/prompt_adapter/worker_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import logging from typing import Any, Optional, Set, Type diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 605c09b8d7225166917b17b81a58f8d57ee55fb5..97f9e2129573147efc4ed4b7e00a316774e994fe 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Sampling parameters for text generation.""" import copy from dataclasses import dataclass diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 20063a5b4b085206a5daa71b528b5eca296074e0..9f6e85920ac7bcccca9221e508a71af8d56fbde2 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import functools import struct from dataclasses import dataclass diff --git a/vllm/scripts.py b/vllm/scripts.py index 
8101e6b3af7ee0e3244aa952490ec24a22a0ce41..467cab28f02ae48a6b0a48d575a7ead2befdb5b9 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # The CLI entrypoint to vLLM. import argparse import os diff --git a/vllm/sequence.py b/vllm/sequence.py index 1801c4e85f4fe3e83b2b16e0a7c62a3ce8000631..aa138a1a32e089c0dffd25f597b3736f011896df 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Sequence and its related classes.""" import copy import enum diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index bbf1bdc650503db9a22345826fac1416afe1cfa3..0097efddad9c2723e2a32d3c45e73781ba1488be 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from array import array from itertools import chain, count from typing import Iterator, List, Optional, Tuple diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index fe5fd39f42ac9219ed1d809c2bed1c5f328ebd20..3948298db40c210a6c390ed0ca35f341d80995dc 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional import torch diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index 40eeecd7d96a1d150e17481b68bd46456e77b3f1..369eafd2e5f2802b0ce89eb4d13f4df5974dab72 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from dataclasses import dataclass from typing import List, Optional, Set, Union diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 45074dbd828aaf939fe3c5dcb9add085ba253c53..897e85b65cf59613d7ba4ccd70a47275bd2b302a 100644 --- 
a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import weakref from typing import List, Optional, Set, Tuple, Dict diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index d678f4578499bd60b2e2c5737937cd3ec7ac1d6c..bc0e0a121cd55363d8bc902747b5e0f4f3f77f56 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from typing import Callable, Optional, Union diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index cb7f30ccf766a6284003f7e0351f528d46ce69cc..168223852464839cf9cf1b9c864d67365b74939e 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Set, Tuple, Dict import torch diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index 3aea2eabb4144bfee44b04463576bfe28939678f..6275c460ecefa0aaca2fe2d6be7e3dc90ccd3aa0 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.sequence import (ExecuteModelRequest, SequenceData, SequenceGroupMetadata, get_all_seq_ids) from vllm.spec_decode.interfaces import (SpeculativeProposals, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4e3c8b418aefb718c265a1b8177a1a2da1947e70..04812c5ed45e44c442457f69f0f396094a06b1c6 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import copy import weakref from typing import Dict, List, Set, Tuple diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 
e906b1789cde8dc426f8b039037f5dafd80e26c3..86390c99c2fbced6163eac2374cac7afe681b602 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import weakref from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index 28a537593f26dd584a205d386e516ca2f7707bd9..2bebf80fadae5e3e637053f95740340bd6a98f7f 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index c6ff5e52f938898c4773931996854a7564c1cc99..a1466ba5db756d59f9ea4e709d27a343fd0943a7 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Set, Tuple import torch diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a1d6264eb125bdac86363352d3b7b06a047a2e84..9c1cd60c39c1868d59b0ec108c2568da224959ac 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import copy from collections import defaultdict diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index 56540744b73a9f3eaf6d4805c1785f5019da9b0d..08e773c562bf83f7f9fd3928b0aa30881f25e526 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional from vllm.sequence import SequenceGroupMetadata diff --git 
a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 6bf7587cdda19020b4d1efc6557b792f3668a42c..b538923c03e74a95e307302bfb9f2d2fe976d838 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Set, Tuple import torch diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 595ad46479a094a8c91ed4c0db277836b2cf1177..410653864c4ecafe8d19d2572ad1eba0c960e00a 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from contextlib import contextmanager from typing import Dict, List, Optional, Sequence, Tuple diff --git a/vllm/tracing.py b/vllm/tracing.py index 72a3f85118d36b574ab6456590db593d1327df82..bf069ad84fd42b815452b6dd72cb3fedde20e053 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Mapping, Optional diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py index eeec029fc051ac0819445630003415e685e4a05a..01d5bb4b574895b2cd5ec7f515c4b6d17d85b707 100644 --- a/vllm/transformers_utils/__init__.py +++ b/vllm/transformers_utils/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.envs import VLLM_USE_MODELSCOPE if VLLM_USE_MODELSCOPE: diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 5805f4ad0b7f75c873b80c9a838a49025541ef9f..1c0f20a6e045b23e52ebdc55705db7e4c79579cb 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import json import os diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f065c56124605e103cbd3b8ff44024e07fe9b053..c484a755ab4ec2c5cc00de5bf4395d79cf55a59f 100644 --- 
a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.cohere2 import Cohere2Config from vllm.transformers_utils.configs.dbrx import DbrxConfig diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 7780bf5e78d6dbe0877f7ac8e6d7875189f5da20..6625ccf0f2a84eecf3b78a14f401027f1e6064dd 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # yapf: disable # ruff: noqa: E501 # coding=utf-8 diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index e563bf6268d72ddebaa2667b35bcf733c8203729..43e9503ffe03f5e4eae1675b3cc215c393591df6 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://github.com/THUDM/ChatGLM2-6B from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py index 1509330fc2179f08c5882cb61f306884ec415af5..e30409b3af5f0c97001dca5b03ae3f56ea2a220f 100644 --- a/vllm/transformers_utils/configs/cohere2.py +++ b/vllm/transformers_utils/configs/cohere2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # ruff: noqa # Adapted from diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py index 0dc9664723d346dfcec31a56b93242451b84282e..8f40b2b7df7ab1b68b7efacb8e74b3010c3254a9 100644 --- a/vllm/transformers_utils/configs/dbrx.py +++ b/vllm/transformers_utils/configs/dbrx.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # yapf: disable # ruff: noqa: E501 # coding=utf-8 diff --git 
a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py index 681528c3c011673b125ef7d263a2ecc3b5dbbdf5..24d4052d872116e2978e2c2d757091a7531b2ff7 100644 --- a/vllm/transformers_utils/configs/deepseek_vl2.py +++ b/vllm/transformers_utils/configs/deepseek_vl2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268 from typing import Tuple diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index b357a785e4dc4dc87fc8c3fae0fc9b5c14c6d72a..b26aba66699fdbd713fade7b770d379194ff2e05 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index f60a59f5541337164f5d9da765ad3519e9c715b8..39364367e30316d1f8540825168b5b35aeb0ce32 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copied from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py index c82cc6065c7eab9d8f7747fc1764f612a78ed6c2..f161a06f34238204ec034a21a88dbf92c047082a 100644 --- a/vllm/transformers_utils/configs/falcon.py +++ b/vllm/transformers_utils/configs/falcon.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py index b94c5b77e4b7faa50bfcd59f2f0a5472d11a6d6f..48b5d79ff950ba6f4332bf0cf0c6b4a194a4fca6 100644 --- a/vllm/transformers_utils/configs/h2ovl.py +++ b/vllm/transformers_utils/configs/h2ovl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py # -------------------------------------------------------- diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py index ac2492317aa36c383ddc54f1e88f63036c00aeca..8ea62546e21336107ac7bb509b857978f700f13e 100644 --- a/vllm/transformers_utils/configs/internvl.py +++ b/vllm/transformers_utils/configs/internvl.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py # -------------------------------------------------------- diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index 82f129eb2018ee3a6c519408f89a76c6df59890d..0cab2c42e57913da241cd09f9c9365a4d26d18c2 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright 2023 Cerebras Systems. 
diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py index 050a32d26fb2be788dea00a0bf98def914dd8cbc..39328647d78dffb5dda30ea5c489e2bb4d72ff17 100644 --- a/vllm/transformers_utils/configs/medusa.py +++ b/vllm/transformers_utils/configs/medusa.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import Optional, Union, List diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py index 49e766d7fa1f4998ad76b279e8b644bdb2e33810..eb77e09adca489bf0aecf46ef66d15884d3cdb56 100644 --- a/vllm/transformers_utils/configs/mllama.py +++ b/vllm/transformers_utils/configs/mllama.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from transformers.models.mllama import configuration_mllama as mllama_hf_config diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py index 946af4e919f7c4e9825126b016dcec75cfe2bf1f..c761f659e5b2c3f022c016352e19a297c72ef183 100644 --- a/vllm/transformers_utils/configs/mlp_speculator.py +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 0f047c8b0361cb5985a8e0d7c38cff0011377d57..96356135f6b28e6c3043efc6c5b000b37e254e7d 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copied from # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py """A HuggingFace-style model configuration.""" diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 1edf36329d83bbb33582c04e1de5a32e73b2107a..fdf4fa2a53e5706eeed69f0ce8a6d1ec81584d74 100644 --- 
a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py index 8007176aecd907a296bc9f0e4092646e039ffb21..300f6e21168e55e7d15a9fc87a41878ccaa82cbb 100644 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ b/vllm/transformers_utils/configs/nvlm_d.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py # -------------------------------------------------------- diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py index 0e6d8e4879b06a306a7ef8f6137d7318469a0f51..c6e446333b43d0aed0dd133bcea5cb463b1e0c15 100644 --- a/vllm/transformers_utils/configs/olmo2.py +++ b/vllm/transformers_utils/configs/olmo2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # yapf: disable # ruff: noqa: E501 # coding=utf-8 diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index 0c1c048f670eec7b41e08006ca092223f6e9767e..0d5db896b93d360c5026ff163da7d0ccd11f183c 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py index eb6f5a059169f9d0b3f7cb924371dbdba33e2377..5da6c5b4427ea1f7ed317467ed535f887c63330c 100644 --- a/vllm/transformers_utils/configs/telechat2.py +++ b/vllm/transformers_utils/configs/telechat2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py """ Telechat configuration compatible with LlamaConfig. """ diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index f724bf7f2f1cd9e862020c3a0f84f957baecd5ab..99715ba6d0b09e18f8988a46cc7e78549e597f3f 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py from typing import Any, Dict, Optional diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 7c8423d2b0a3490039f1280bb1317d8611a1b92b..9d1d4bb92e4ab5043007f79b69ccd14ef380e51b 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Optional from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams, diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 37ff8a236e7918d13c10aabf9ac745f14e56d50b..8160a35ff2228f1eb503ab9d001bf808ff097c86 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional, Tuple from .tokenizer 
import AnyTokenizer diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index b12cc83a2297095dec75db13cc53971bcea3bb9c..3197b07d8a4687a999695f1972755c87e5dfe366 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from functools import lru_cache from typing import Any, cast diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 9c71b8cada32e37e3423848d0c5fac734000b673..4696f0c49df96dfe3969d4a3a8bdb98dd18b216f 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.transformers_utils.processors.deepseek_vl2 import ( DeepseekVLV2Processor) diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index 27cdf6bc22d0eaa0085ab576950d9c35818f84fe..d37381ea9925fc23d1f926ca8d1f7e42c49eb1e4 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # yapf: disable # ruff: noqa: E501 # coding=utf-8 diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 74a56cbf57ec323b712c1c0c0f294e7033f3c415..4fe744d285d35a2e117042f26a7c6fe44ef08958 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import fnmatch import os import shutil diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 62474089529528255aa53086a7634fdf9aba4176..bfc0df1df3001dc8ab666af8a72bc6e53b6162ff 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import 
contextlib import os import warnings diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index 09569c564a58dd78c37fbe71c9f86993e782c454..c223768b16d6b73c42c73ce9d74b2a7092917a94 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Optional, Type from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index e6cc7cd4e2e3ac26bd856deda3e8034c09553914..fbdfa3e57e1721ff1aae54c3127269e3c1c43769 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from abc import ABC, abstractmethod from typing import List, Optional diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 3f7627e11ae5effc032d502aa299b47947270c53..30cab752ccf3c832b6f076b742776b082f503120 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from typing import List, Optional diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index 6dc2f905618739c08ae06d85c7848896867497a3..025971cb7e47787a81311f02901874b66a197925 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Optional from vllm.config import 
TokenizerPoolConfig diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index e68ad79b296b8f97767b1f4462edc7dcb085edbe..2b64f3fc70569975f9b67e1c58733c55df9c19ab 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from .mistral import MistralTokenizer, maybe_serialize_tool_calls __all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index d801cf4e4c7b1bd33da25894fd5796cad5b15936..1550f978ed2013bf9ba392dade145a7f2e20b4e4 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import re from dataclasses import dataclass @@ -6,21 +8,18 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast import huggingface_hub from huggingface_hub import HfApi, hf_hub_download -from mistral_common.protocol.instruct.request import ChatCompletionRequest -from mistral_common.tokens.tokenizers.base import SpecialTokens -# yapf: disable -from mistral_common.tokens.tokenizers.mistral import ( - MistralTokenizer as PublicMistralTokenizer) -# yapf: enable -from mistral_common.tokens.tokenizers.sentencepiece import ( - SentencePieceTokenizer) -from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy, - Tekkenizer) from vllm.logger import init_logger from vllm.utils import is_list_of if TYPE_CHECKING: + # make sure `mistral_common` is lazy imported, + # so that users who only use non-mistral models + # will not be bothered by the dependency. 
+ from mistral_common.protocol.instruct.request import ChatCompletionRequest + from mistral_common.tokens.tokenizers.mistral import ( + MistralTokenizer as PublicMistralTokenizer) + from vllm.entrypoints.chat_utils import ChatCompletionMessageParam logger = init_logger(__name__) @@ -31,7 +30,7 @@ class Encoding: input_ids: Union[List[int], List[List[int]]] -def maybe_serialize_tool_calls(request: ChatCompletionRequest): +def maybe_serialize_tool_calls(request: "ChatCompletionRequest"): # SEE: https://github.com/vllm-project/vllm/pull/9951 # Credits go to: @gcalmettes # NOTE: There is currently a bug in pydantic where attributes @@ -106,12 +105,16 @@ def find_tokenizer_file(files: List[str]): class MistralTokenizer: - def __init__(self, tokenizer: PublicMistralTokenizer) -> None: + def __init__(self, tokenizer: "PublicMistralTokenizer") -> None: self.mistral = tokenizer self.instruct = tokenizer.instruct_tokenizer tokenizer_ = tokenizer.instruct_tokenizer.tokenizer + from mistral_common.tokens.tokenizers.tekken import ( + SpecialTokenPolicy, Tekkenizer) self.is_tekken = isinstance(tokenizer_, Tekkenizer) + from mistral_common.tokens.tokenizers.sentencepiece import ( + SentencePieceTokenizer) self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer) if self.is_tekken: # Make sure special tokens will not raise @@ -151,6 +154,8 @@ class MistralTokenizer: assert Path( path_or_repo_id).is_file(), f"Invalid path: {path_or_repo_id}" + from mistral_common.tokens.tokenizers.mistral import ( + MistralTokenizer as PublicMistralTokenizer) mistral_tokenizer = PublicMistralTokenizer.from_file(tokenizer_file) return cls(mistral_tokenizer) @@ -179,6 +184,8 @@ class MistralTokenizer: # by the guided structured output backends. 
@property def all_special_tokens_extended(self) -> List[str]: + from mistral_common.tokens.tokenizers.base import SpecialTokens + # tekken defines its own extended special tokens list if hasattr(self.tokenizer, "SPECIAL_TOKENS"): special_tokens = self.tokenizer.SPECIAL_TOKENS @@ -282,6 +289,8 @@ class MistralTokenizer: if last_message["role"] == "assistant": last_message["prefix"] = True + from mistral_common.protocol.instruct.request import ( + ChatCompletionRequest) request = ChatCompletionRequest(messages=messages, tools=tools) # type: ignore[type-var] encoded = self.mistral.encode_chat_completion(request) @@ -290,6 +299,7 @@ class MistralTokenizer: return encoded.tokens def convert_tokens_to_string(self, tokens: List[str]) -> str: + from mistral_common.tokens.tokenizers.base import SpecialTokens if self.is_tekken: tokens = [ t for t in tokens @@ -361,6 +371,8 @@ class MistralTokenizer: ids: List[int], skip_special_tokens: bool = True, ) -> List[str]: + from mistral_common.tokens.tokenizers.base import SpecialTokens + # TODO(Patrick) - potentially allow special tokens to not be skipped assert ( skip_special_tokens diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 10a09fb4f566ccbdc883d316acb9ec6bdbe10b61..71fe3ef0b23c5d3b43ac515db2aeaae96cea59f1 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from os import PathLike from pathlib import Path from typing import Union diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 568185383aa5c36854e8548357f39974dbe7ebd6..c8f7a32ce7a8c5cf75165e1778cf3be04532a921 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from vllm.triton_utils.importing import HAS_TRITON __all__ = ["HAS_TRITON"] diff --git a/vllm/triton_utils/custom_cache_manager.py b/vllm/triton_utils/custom_cache_manager.py 
index 02b65fbf5e98d1f369253ea3a8717bb2973bbdd1..9fa4666cd3e3e429ad996386612c57dd472ea07c 100644 --- a/vllm/triton_utils/custom_cache_manager.py +++ b/vllm/triton_utils/custom_cache_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from triton.runtime.cache import (FileCacheManager, default_cache_dir, diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 0c96e0632f64635da30889639a850576cebf8ad4..a20700248c26872202e703b3283289b7e99785a9 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from importlib.util import find_spec from vllm.logger import init_logger diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 7f5cc906382afa3ac6b00d8b7cb36060dc93affd..fbbb21c89370a1e8f8b7155c5220acad3d870ff8 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import datetime import json import logging diff --git a/vllm/utils.py b/vllm/utils.py index 5deea1082b323e5f9497c380f005750b8c09cbf8..815e7182428e1cbacf937a419beee22d7b247340 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import argparse import asyncio import concurrent @@ -561,6 +563,10 @@ def cdiv(a: int, b: int) -> int: return -(a // -b) +def round_up(x: int, y: int) -> int: + return ((x + y - 1) // y) * y + + def _generate_random_fp8( tensor: torch.Tensor, low: float, @@ -792,6 +798,12 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() +def align_to_256bytes(extent: int, dtype: torch.dtype) -> int: + dtype_size = get_dtype_size(dtype) + eles_per_256bytes = 256 // dtype_size + return round_up(extent, eles_per_256bytes) + + # `collections` helpers def is_list_of( value: object, @@ -2297,3 +2309,55 @@ def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any], else: func = 
partial(method, obj) # type: ignore return func(*args, **kwargs) + + +def import_pynvml(): + """ + Historical comments: + + libnvml.so is the library behind nvidia-smi, and + pynvml is a Python wrapper around it. We use it to get GPU + status without initializing CUDA context in the current process. + Historically, there are two packages that provide pynvml: + - `nvidia-ml-py` (https://pypi.org/project/nvidia-ml-py/): The official + wrapper. It is a dependency of vLLM, and is installed when users + install vLLM. It provides a Python module named `pynvml`. + - `pynvml` (https://pypi.org/project/pynvml/): An unofficial wrapper. + Prior to version 12.0, it also provides a Python module `pynvml`, + and therefore conflicts with the official one. What's worse, + the module is a Python package, and has higher priority than + the official one which is a standalone Python file. + This causes errors when both of them are installed. + Starting from version 12.0, it migrates to a new module + named `pynvml_utils` to avoid the conflict. + + TL;DR: if users have pynvml<12.0 installed, it will cause problems. + Otherwise, `import pynvml` will import the correct module. + We take the safest approach here, to manually import the correct + `pynvml.py` module from the `nvidia-ml-py` package. + """ + if TYPE_CHECKING: + import pynvml + return pynvml + if "pynvml" in sys.modules: + import pynvml + if pynvml.__file__.endswith("__init__.py"): + # this is pynvml < 12.0 + raise RuntimeError( + "You are using a deprecated `pynvml` package. " + "Please uninstall `pynvml` or upgrade to at least" + " version 12.0. 
See https://pypi.org/project/pynvml " + "for more information.") + return sys.modules["pynvml"] + import importlib.util + import os + import site + for site_dir in site.getsitepackages(): + pynvml_path = os.path.join(site_dir, "pynvml.py") + if os.path.exists(pynvml_path): + spec = importlib.util.spec_from_file_location( + "pynvml", pynvml_path) + pynvml = importlib.util.module_from_spec(spec) + sys.modules["pynvml"] = pynvml + spec.loader.exec_module(pynvml) + return pynvml diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ce83b1fac6c0bcf4bdd9c3349ed9063b11d80944..837d7faf43708dbc2ece2eaa60c7283293f5f7c1 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Attention layer with FlashAttention.""" from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 9d570b334c6cf59f25e838ddaa24610630115a50..651bc01aa5cf665c46bb0def62694499ae79d793 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, Dict, List, Set, Tuple from vllm.logger import init_logger diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index d6c612f155f019608c92a75a72e928983bfb0e22..de349ec12099931b81729d45274454e6b6f73c27 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,5 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import defaultdict -from typing import Dict, Iterable, List, Optional, Tuple +from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple from vllm.logger import init_logger from vllm.utils import cdiv @@ -67,7 +69,8 @@ class KVCacheManager: # Mapping from request ID to blocks 
to track the blocks allocated # for each request, so that we can free the blocks when the request # is finished. - self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {} + self.req_to_blocks: DefaultDict[str, + List[KVCacheBlock]] = defaultdict(list) @property def usage(self) -> float: @@ -115,33 +118,75 @@ class KVCacheManager: num_computed_tokens = len(computed_blocks) * self.block_size return computed_blocks, num_computed_tokens - def append_slots( + def allocate_slots( self, request: Request, num_tokens: int, + new_computed_blocks: Optional[List[KVCacheBlock]] = None ) -> Optional[List[KVCacheBlock]]: - """Append slots to the block table of the request. - We first append slots to already allocated blocks. If the allocated - blocks are not enough, we allocate new blocks. + """Add slots for a request with new tokens to append. Args: - request: The request to append slots. - num_tokens: The number of tokens to append. + request: The request to allocate slots. + num_tokens: The number of tokens to allocate. Note that this does + not include the tokens that have already been computed. + new_computed_blocks: A list of new computed blocks just hitting the + prefix caching. + + Blocks layout: + ----------------------------------------------------------------------- + | < computed > | < new computed > | < new > | < pre-allocated > | + ----------------------------------------------------------------------- + | < required > | + -------------------------------------------------- + | < full > | + ------------------------------------------------ + | | + -------------- + The following *_blocks are illustrated in this layout. Returns: - A list of new blocks if new blocks are allocated, or None - if new blocks are required but cannot be allocated. + A list of new allocated blocks. 
""" - num_required_blocks = cdiv(request.num_computed_tokens + num_tokens, + if num_tokens == 0: + raise ValueError("num_tokens must be greater than 0") + + new_computed_blocks = new_computed_blocks or [] + + # The number of computed tokens is the number of computed tokens plus + # the new prefix caching hits + num_computed_tokens = (request.num_computed_tokens + + len(new_computed_blocks) * self.block_size) + num_required_blocks = cdiv(num_computed_tokens + num_tokens, self.block_size) req_blocks = self.req_to_blocks[request.request_id] + num_new_blocks = (num_required_blocks - len(req_blocks) - + len(new_computed_blocks)) - num_new_blocks = num_required_blocks - len(req_blocks) - if num_new_blocks > self.free_block_queue.num_free_blocks: - # Need to allocate new blocks due to insufficient pre-allocated - # slots, but we cannot allocate new blocks due to the limit. + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. + num_evictable_computed_blocks = sum(1 for blk in new_computed_blocks + if blk.ref_cnt == 0) + if (num_new_blocks > self.free_block_queue.num_free_blocks - + num_evictable_computed_blocks): + # Cannot allocate new blocks return None + # Touch the computed blocks to make sure they won't be evicted. + if self.enable_caching: + self._touch(new_computed_blocks) + else: + assert not new_computed_blocks, ( + "Computed blocks should be empty when " + "prefix caching is disabled") + + # Append the new computed blocks to the request blocks until now to + # avoid the case where the new blocks cannot be allocated. + req_blocks.extend(new_computed_blocks) + + # Start to handle new blocks + if num_new_blocks <= 0: # No new block is needed. new_blocks = [] @@ -160,112 +205,29 @@ class KVCacheManager: ) assert num_new_blocks > 0 + # Concatenate the computed block IDs and the new block IDs. 
new_blocks = self._get_new_blocks(num_new_blocks) req_blocks.extend(new_blocks) if not self.enable_caching: return new_blocks - num_computed_full_blocks = (request.num_computed_tokens // - self.block_size) - # NOTE(rickyx): We are assuming the `num_tokens` are actual # tokens rather than lookahead slots (e.g. for speculative decoding). # TODO(rickyx): When supporting speculative decoding, we will need to # differentiate between them so that we can know how many blocks are # full after appending the actual tokens. - num_full_blocks_after_append = (request.num_computed_tokens + - num_tokens) // self.block_size - assert num_full_blocks_after_append <= len(req_blocks) - - new_full_blocks = req_blocks[ - num_computed_full_blocks:num_full_blocks_after_append] - if new_full_blocks: - self._cache_full_blocks( - request=request, - blk_start_idx=num_computed_full_blocks, - full_blocks=new_full_blocks, - prev_block=req_blocks[num_computed_full_blocks - 1] - if num_computed_full_blocks >= 1 else None, - ) - - return new_blocks - - def allocate_slots( - self, - request: Request, - num_tokens: int, - computed_blocks: List[KVCacheBlock], - ) -> Optional[List[KVCacheBlock]]: - """Allocate slots for a new request. - - Args: - request: The request to allocate slots. - num_tokens: The number of tokens to allocate. Note that this does - not include the tokens that have already been computed. - computed_blocks: A list of computed blocks. - - Returns: - A list of new allocated blocks. - """ - if num_tokens == 0: - raise ValueError( - f"num_tokens must be greater than 0, got {num_tokens}") - - # If a computed block of a request is an eviction candidate (in the - # free queue and ref_cnt == 0), it cannot be counted as a free block - # when allocating this request. 
- num_evictable_computed_blocks = sum(1 for blk in computed_blocks - if blk.ref_cnt == 0) - - num_required_blocks = cdiv(num_tokens, self.block_size) - if (num_required_blocks > self.free_block_queue.num_free_blocks - - num_evictable_computed_blocks): - # Cannot allocate new blocks. - return None - - # Touch the computed blocks to make sure they won't be evicted. - if self.enable_caching: - self._touch(computed_blocks) - else: - assert not computed_blocks, ( - "Computed blocks should be empty when " - "prefix caching is disabled") - - # Determine the number of new blocks to allocate considering - # preallocated blocks. - num_new_blocks = min( - num_required_blocks + self.num_preallocate_blocks, - self.free_block_queue.num_free_blocks, - # Should not exceed the maximum number of blocks per request. - # This is especially because the block table has the shape - # [..., max_num_blocks_per_req]. - # TODO(woosuk): Check and reject requests if - # num_prompt_tokens + max_tokens > max_model_len. - self.max_num_blocks_per_req - len(computed_blocks), - ) - assert num_new_blocks > 0 - - # Concatenate the computed block IDs and the new block IDs. - new_blocks = self._get_new_blocks(num_new_blocks) - self.req_to_blocks[request.request_id] = computed_blocks + new_blocks - - if not self.enable_caching: - return new_blocks - - num_computed_tokens = len(computed_blocks) * self.block_size num_full_blocks = (num_computed_tokens + num_tokens) // self.block_size - - new_full_blocks = self.req_to_blocks[ - request.request_id][len(computed_blocks):num_full_blocks] + num_computed_full_blocks = num_computed_tokens // self.block_size + new_full_blocks = req_blocks[num_computed_full_blocks:num_full_blocks] if new_full_blocks: self._cache_full_blocks( request=request, - blk_start_idx=len(computed_blocks), + blk_start_idx=num_computed_full_blocks, # The new full blocks are the full blocks that are not computed. 
full_blocks=new_full_blocks, - prev_block=computed_blocks[-1] if computed_blocks else None, - ) + prev_block=(req_blocks[num_computed_full_blocks - 1] + if num_computed_full_blocks > 0 else None)) return new_blocks @@ -290,29 +252,6 @@ class KVCacheManager: if block.ref_cnt == 0: self.free_block_queue.append(block) - def uncache_blocks(self, request: Request) -> int: - """Uncache the blocks that are no longer full based on the - num_computed_tokens in the given request. This happens when - the blocks were full and cached due to speculative tokens, but the - speculative tokens are not accepted. - - Args: - request: The request. - - Returns: - The number of uncached blocks. - """ - blocks = self.req_to_blocks[request.request_id] - num_computed_tokens = request.num_computed_tokens - num_full_blocks = num_computed_tokens // self.block_size - num_uncached_blocks = 0 - for block in blocks[num_full_blocks:]: - # If the block is not cached, the following blocks are not cached. - if not self._maybe_evict_cached_block(block): - break - num_uncached_blocks += 1 - return num_uncached_blocks - def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF flows to invalid prefix caching after the weights are updated, @@ -508,8 +447,22 @@ class KVCacheManager: assert prev_block.block_hash is not None prev_block_hash_value = prev_block.block_hash.hash_value - for i, blk in enumerate(full_blocks): - blk_idx = blk_start_idx + i + # Find the first uncached block. This case should only happen when + # speculative decoding is used. + offset = 0 + for blk in full_blocks: + if blk.block_hash is None: + break + else: + prev_block_hash_value = blk.block_hash.hash_value + offset += 1 + else: + # All blocks are cached. 
+ return + + for i, blk in enumerate(full_blocks[offset:]): + blk_idx = blk_start_idx + offset + i + assert blk.block_hash is None if blk_idx < num_cached_block_hashes: # The block hash may already be computed in diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 2b6557ad3ce66350c17e03377510d57f2a7b462b..e0976ba8577b9ac4e02037c79d7e0914a6fc9675 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """KV-Cache Utilities.""" from collections.abc import Sequence from dataclasses import dataclass @@ -262,6 +263,15 @@ def hash_block_tokens( The hash value of the block and the token ids in the block. The entire tuple is used as the hash key of the block. """ + if not parent_block_hash: + # Note that we use 'None' as a string here instead of None because + # as of Python 3.12, hash(None) returns a constant predictable value. + # This could possibly make it easier to find and exploit hash + # collisions. 'None' as a string will be hashed differently per process, + # but consistently within the same process. This is the same as the + # behavior of None prior to Python 3.12. + parent_block_hash = hash('None') + curr_block_token_ids_tuple = tuple(curr_block_token_ids) return BlockHashType( hash((parent_block_hash, curr_block_token_ids_tuple, extra_keys)), diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 910fc4ff4d2b657c01dcd7fe8b09287e847e14cf..fb5e83fe062747ee85abf83243938c3763c1a2eb 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import deque from dataclasses import dataclass from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, @@ -65,10 +67,10 @@ class Scheduler: # This is flushed at the end of each scheduling step. 
self.finished_req_ids: Set[str] = set() - # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating + # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating # them at each scheduling step. - # Request id -> RunningRequestData - self.running_reqs_data: Dict[str, RunningRequestData] = {} + # Request id -> CachedRequestData + self._cached_reqs_data: Dict[str, CachedRequestData] = {} # Encoder-related. # Calculate encoder cache size if applicable @@ -113,17 +115,8 @@ class Scheduler: encoder_budget = self.max_num_encoder_input_tokens # First, schedule the RUNNING requests. - # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be - # in the "partial" state, where the request has some tokens computed - # but not all. The constraint is due to the persistent batch in the - # V1 model runner. - # TODO(woosuk): Remove this constraint after refactoring model runner. - has_partial_request = False req_index = 0 - while req_index < len(self.running): - # Only the last request in the RUNNING queue can be "partial". - assert not has_partial_request - assert token_budget > 0 + while req_index < len(self.running) and token_budget > 0: request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens num_new_tokens = min(num_new_tokens, token_budget) @@ -135,10 +128,17 @@ class Scheduler: request.num_computed_tokens, num_new_tokens, encoder_budget)) - assert num_new_tokens > 0 + if num_new_tokens == 0: + # The request cannot be scheduled because the encoder budget + # or the encoder cache is exhausted. + # NOTE(woosuk): Here, by doing `continue` instead of `break`, + # we do not strictly follow the FCFS scheduling policy and + # allow the lower-priority requests to be scheduled. + req_index += 1 + continue while True: - new_blocks = self.kv_cache_manager.append_slots( + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens) if new_blocks is None: # The request cannot be scheduled. 
@@ -170,8 +170,6 @@ class Scheduler: num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 - has_partial_request = (request.num_computed_tokens + num_new_tokens - < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: @@ -184,13 +182,9 @@ class Scheduler: # Next, schedule the WAITING requests. if not preempted_reqs: - while self.waiting: - if has_partial_request: - break + while self.waiting and token_budget > 0: if len(self.running) == self.max_num_running_reqs: break - if token_budget == 0: - break request = self.waiting[0] # Get already-cached tokens. @@ -247,8 +241,6 @@ class Scheduler: token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens - < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: @@ -264,8 +256,11 @@ class Scheduler: assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens assert token_budget >= 0 assert len(self.running) <= self.max_num_running_reqs + # Since some requests in the RUNNING queue may not be scheduled in + # this step, the total number of scheduled requests can be smaller than + # len(self.running). assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + - len(scheduled_running_reqs) == len(self.running)) + len(scheduled_running_reqs) <= len(self.running)) # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. 
@@ -284,25 +279,28 @@ class Scheduler: for req in scheduled_new_reqs ] resumed_reqs_data = [ - ResumedRequestData.from_request( - req, req_to_new_block_ids[req.request_id], - req.num_computed_tokens) for req in scheduled_resumed_reqs + self._make_cached_request_data( + req, + req_to_new_block_ids[req.request_id], + req.num_computed_tokens, + resumed_from_preemption=True, + ) for req in scheduled_resumed_reqs ] running_reqs_data = [ - self._make_running_request_data( - req, req_to_new_block_ids[req.request_id], - req.num_computed_tokens) for req in scheduled_running_reqs + self._make_cached_request_data( + req, + req_to_new_block_ids[req.request_id], + req.num_computed_tokens, + resumed_from_preemption=False, + ) for req in scheduled_running_reqs ] - preempted_req_ids = {req.request_id for req in preempted_reqs} scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, - scheduled_resumed_reqs=resumed_reqs_data, - scheduled_running_reqs=running_reqs_data, + scheduled_cached_reqs=resumed_reqs_data + running_reqs_data, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, num_common_prefix_blocks=num_common_prefix_blocks, - preempted_req_ids=preempted_req_ids, # finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step. # It contains the request IDs that are finished in between @@ -314,22 +312,26 @@ class Scheduler: self.finished_req_ids = set() return scheduler_output - def _make_running_request_data( + def _make_cached_request_data( self, request: Request, new_block_ids: List[int], num_computed_tokens: int, - ) -> "RunningRequestData": - # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating + resumed_from_preemption: bool, + ) -> "CachedRequestData": + # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating # them at each scheduling step. 
- if request.request_id in self.running_reqs_data: - req_data = self.running_reqs_data[request.request_id] + if request.request_id in self._cached_reqs_data: + req_data = self._cached_reqs_data[request.request_id] + req_data.resumed_from_preemption = resumed_from_preemption req_data.new_block_ids = new_block_ids req_data.num_computed_tokens = num_computed_tokens else: - req_data = RunningRequestData.from_request(request, new_block_ids, - num_computed_tokens) - self.running_reqs_data[request.request_id] = req_data + req_data = CachedRequestData.from_request(request, + resumed_from_preemption, + new_block_ids, + num_computed_tokens) + self._cached_reqs_data[request.request_id] = req_data return req_data def _try_schedule_encoder_inputs( @@ -418,7 +420,13 @@ class Scheduler: # expensive operations inside the loop. for request in self.running: req_id = request.request_id - request.num_computed_tokens += num_scheduled_tokens[req_id] + num_tokens_scheduled = num_scheduled_tokens.get(req_id, 0) + if num_tokens_scheduled == 0: + # The request was not scheduled in this step. + new_running.append(request) + continue + + request.num_computed_tokens += num_tokens_scheduled # When the request's num_computed_tokens catches up its num_tokens, # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. 
@@ -527,7 +535,7 @@ class Scheduler: assert request.is_finished() self.kv_cache_manager.free(request) self.encoder_cache_manager.free(request) - self.running_reqs_data.pop(request.request_id, None) + self._cached_reqs_data.pop(request.request_id, None) del self.requests[request.request_id] self.finished_req_ids.add(request.request_id) @@ -582,30 +590,13 @@ class NewRequestData: @dataclass -class ResumedRequestData: - - req_id: str - block_ids: List[int] - num_computed_tokens: int - - @classmethod - def from_request( - cls, - request: Request, - block_ids: List[int], - num_computed_tokens: int, - ) -> "ResumedRequestData": - return cls( - req_id=request.request_id, - block_ids=block_ids, - num_computed_tokens=num_computed_tokens, - ) - - -@dataclass -class RunningRequestData: +class CachedRequestData: req_id: str + # If resumed_from_preemption is False, new_block_ids will be appended to + # the request's block IDs. If True, new_block_ids will be used as the + # request's block IDs instead of appending to the existing block IDs. 
+ resumed_from_preemption: bool new_block_ids: List[int] num_computed_tokens: int @@ -613,11 +604,13 @@ class RunningRequestData: def from_request( cls, request: Request, + resumed_from_preemption: bool, new_block_ids: List[int], num_computed_tokens: int, - ) -> "RunningRequestData": + ) -> "CachedRequestData": return cls( req_id=request.request_id, + resumed_from_preemption=resumed_from_preemption, new_block_ids=new_block_ids, num_computed_tokens=num_computed_tokens, ) @@ -627,14 +620,12 @@ class RunningRequestData: class SchedulerOutput: scheduled_new_reqs: List[NewRequestData] - scheduled_resumed_reqs: List[ResumedRequestData] - scheduled_running_reqs: List[RunningRequestData] + scheduled_cached_reqs: List[CachedRequestData] num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int scheduled_encoder_inputs: Dict[str, List[int]] num_common_prefix_blocks: int - preempted_req_ids: Set[str] finished_req_ids: Set[str] free_encoder_input_ids: List[Tuple[str, int]] diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index abe4952c4baff8928a1a13acfd682c88f494b27f..d5933cac50c202e2386b399520ee419450188ac5 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from dataclasses import dataclass from typing import TYPE_CHECKING, List, Optional, Union @@ -12,6 +14,29 @@ if TYPE_CHECKING: from vllm.multimodal.inputs import PlaceholderRange from vllm.sampling_params import SamplingParams +# These are possible values of RequestOutput.finish_reason, +# so form part of the external API. +FINISH_REASON_STRINGS = ("stop", "length", "abort") + + +class FinishReason(enum.IntEnum): + """ + Reason a request finished - stop, length, or abort. + + Int rather than Str for more compact serialization. 
+ + stop - a stop string was emitted + length - max_tokens was consumed, or max_model_len was reached + abort - aborted for another reason + + """ + STOP = 0 + LENGTH = 1 + ABORT = 2 + + def __str__(self): + return FINISH_REASON_STRINGS[self.value] + @dataclass class EngineCoreRequest: @@ -43,7 +68,7 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] finished: bool - finish_reason: Optional[str] = None + finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None @@ -54,7 +79,7 @@ class EngineCoreOutputs( gc=False): # type: ignore[call-arg] #NOTE(Nick): We could consider ways to make this more compact, - # e.g. columnwise layout and using an int enum for finish/stop reason + # e.g. columnwise layout # [num_reqs] outputs: List[EngineCoreOutput] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b9dc3561d1750e83849ba661e0b50c69ad0c4e2b..3c4e35e4aa2749dfbb65eb049d77c68ebcced146 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os from typing import AsyncGenerator, List, Mapping, Optional, Type, Union diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f50303bda58fd79552f7d25326f392e0f7bbd44f..29a9ac1868f27869a0c80de847adb875f56197f1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle import queue import signal diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index f3b992d6873e7334bfa7b8dc4a9c83a319486a34..247380ef7cfedae1986602ef59abd91192ab0226 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio import os import signal diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 
4a8b61beec037f2901150759cd085e17ee72c883..861fcb012c34ecb918c7e044dbda3cbb5068ceae 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import List, Optional, Union @@ -6,7 +8,7 @@ from vllm.logger import init_logger from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason logger = init_logger(__name__) @@ -16,7 +18,7 @@ class DetokenizerOutput: output_text: str token_ids: List[int] finished: bool - finish_reason: Optional[str] = None + finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None @@ -145,13 +147,13 @@ class IncrementalDetokenizer: stop_str, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] - finish_reason = "stop" # TODO: use constant + finish_reason = FinishReason.STOP stop_reason = stop_str # TODO: handle stop_token_ids here too? # 3) Update the RequestOutput object with the new text. 
- finished = bool(finish_reason) + finished = finish_reason is not None if self.output_kind == RequestOutputKind.FINAL_ONLY \ and not finished: return None diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 55d314ebeb95517d15b5a049907042fb223ef1d3..e0452bcad7ba7e1afc560230dd183ece5fe47a90 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Mapping, Optional, Type, Union from typing_extensions import TypeVar diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py index d83460a40ad269e513a75b2200c16cb0d3e6ffef..83a0d9db161d2c0e11543fe60d8f5f29d23cb7aa 100644 --- a/vllm/v1/engine/mm_input_mapper.py +++ b/vllm/v1/engine/mm_input_mapper.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Any, Dict, List, Optional from vllm.config import ModelConfig diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 234ef8194ca93bddc4baf675acec2af572bddaab..947366691471784dd08462190f58767a10276630 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import asyncio from dataclasses import dataclass from typing import Dict, List, Optional @@ -159,8 +161,10 @@ class OutputProcessor: engine_core_output) # 3) Create and handle RequestOutput objects. - if request_output := self._make_request_output( - req_state, detokenizer_output): + if detokenizer_output is not None: + request_output = self._make_request_output( + req_state, detokenizer_output) + if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). req_state.queue.put_nowait(request_output) @@ -170,6 +174,8 @@ class OutputProcessor: # Free completed requests. 
if request_output.finished: + assert detokenizer_output.finish_reason is not None + self.request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer @@ -178,7 +184,8 @@ class OutputProcessor: # Track per-request stats iteration_stats.update_from_finished_request( - request_output, req_state.stats) + detokenizer_output.finish_reason, request_output, + req_state.stats) return OutputProcessorOutput( request_outputs=request_outputs, @@ -189,12 +196,8 @@ class OutputProcessor: @staticmethod def _make_request_output( request_state: RequestState, - detokenizer_output: Optional[DetokenizerOutput], - ) -> Optional[RequestOutput]: - - if detokenizer_output is None: - return None - + detokenizer_output: DetokenizerOutput, + ) -> RequestOutput: request_output = RequestOutput.new( request_state.request_id, request_state.prompt, @@ -205,7 +208,8 @@ class OutputProcessor: ) if detokenizer_output.finished: completion_output = request_output.outputs[0] - completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.finish_reason = str( + detokenizer_output.finish_reason) completion_output.stop_reason = detokenizer_output.stop_reason return request_output diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 6196c110520781eb33ce3ba250c6e25ce71e80d6..366287951ed0444d5130d60e63d7e49fb38e6e15 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from typing import Mapping, Optional, Union diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 131be759842c7bd9c095cb3c573df7f76f0bdf89..ac10d43eb0d54d634af49964234cdcdc00c3c8d9 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Type from vllm.config import VllmConfig diff --git a/vllm/v1/executor/multiproc_executor.py 
b/vllm/v1/executor/multiproc_executor.py index f6cf35da0106b63a8f42f887fb2605576c3a2fe6..e3f07172d8cd9bc280740c1d92caa2e6ca8f0607 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os import pickle import signal diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 6d5cc32ffc5b8035dc63413f9ec6d1b7cd96113f..eddfb5949ebe65c3dd5f8ae72a8aad06ee818703 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index f901822c7887c95f24426363584bc29bd7bf9be3..eb1acf584c6b02e19f9ca6e41c58f5571bc310cc 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,12 +1,15 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from abc import ABC, abstractmethod -from typing import List +from typing import Dict, List import numpy as np import prometheus_client from vllm.config import ModelConfig from vllm.logger import init_logger +from vllm.v1.engine import FinishReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -114,6 +117,17 @@ class PrometheusStatLogger(StatLoggerBase): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) + self.counter_request_success: Dict[FinishReason, + prometheus_client.Counter] = {} + counter_request_success_base = prometheus_client.Counter( + name="vllm:request_success_total", + documentation="Count of successfully processed requests.", + labelnames=labelnames + ["finished_reason"]) + for reason in FinishReason: + self.counter_request_success[ + reason] = counter_request_success_base.labels(*(labelvalues + + [str(reason)])) + self.histogram_num_prompt_tokens_request = \ 
prometheus_client.Histogram( name="vllm:request_prompt_tokens", @@ -161,6 +175,7 @@ class PrometheusStatLogger(StatLoggerBase): iteration_stats.num_generation_tokens) for finished_request in iteration_stats.finished_requests: + self.counter_request_success[finished_request.finish_reason].inc() self.histogram_num_prompt_tokens_request.observe( finished_request.num_prompt_tokens) self.histogram_num_generation_tokens_request.observe( diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 5277505128a630b5eaa203058e21d890093eac66..e3f1efcc9b1a7547405642e87097c450eac86363 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,10 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from dataclasses import dataclass from typing import TYPE_CHECKING, List if TYPE_CHECKING: from vllm.outputs import RequestOutput - from vllm.v1.engine import EngineCoreOutput + from vllm.v1.engine import EngineCoreOutput, FinishReason @dataclass @@ -30,6 +32,7 @@ class RequestStateStats: class FinishedRequestStats: """Stats associated with a finished request.""" + finish_reason: "FinishReason" num_prompt_tokens: int = 0 num_generation_tokens: int = 0 @@ -71,8 +74,10 @@ class IterationStats: request_state_stats.num_generation_tokens += num_new_generation_tokens request_state_stats.last_token_time = now - def update_from_finished_request(self, request_output: "RequestOutput", + def update_from_finished_request(self, finish_reason: "FinishReason", + request_output: "RequestOutput", request_state_stats: RequestStateStats): self.finished_requests.append( - FinishedRequestStats(len(request_output.prompt_token_ids), + FinishedRequestStats(finish_reason, + len(request_output.prompt_token_ids), request_state_stats.num_generation_tokens)) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 32aee44e3f374723cf33499cd7e85aae2a5b6fde..6e82bffd7e5c9dfff0a077ecb9c34a3cad4c9c53 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,3 +1,5 @@ +# 
SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List, Optional diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 2cfcd8b63ccb28570660a2efee5c945c7d7657ca..89b39ea615d20f6a1cfb3c3b91e2d211008a85ba 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,10 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum from typing import TYPE_CHECKING, List, Optional, Union from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineCoreRequest, FinishReason from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -64,6 +66,7 @@ class Request: # Cache the computed kv block hashes of the request to avoid # recomputing. self._kv_block_hashes: List[BlockHashType] = [] + self.kv_block_hashes = ConstantList(self._kv_block_hashes) # Read-only views # Prevent directly appending to the these lists since @@ -106,7 +109,7 @@ class Request: def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) - def get_finished_reason(self) -> Union[str, None]: + def get_finished_reason(self) -> Union[FinishReason, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: @@ -121,13 +124,9 @@ class Request: num_tokens = self.mm_positions[input_id]["length"] return num_tokens - @property - def kv_block_hashes(self) -> ConstantList["BlockHashType"]: - # Prevent directly appending to the kv_block_hashes. 
- return ConstantList(self._kv_block_hashes) - def set_kv_block_hashes(self, value: List["BlockHashType"]) -> None: self._kv_block_hashes = value + self.kv_block_hashes = ConstantList(self._kv_block_hashes) def append_kv_block_hashes(self, block_hash: "BlockHashType") -> None: self._kv_block_hashes.append(block_hash) @@ -150,7 +149,8 @@ class RequestStatus(enum.IntEnum): return status > RequestStatus.PREEMPTED @staticmethod - def get_finished_reason(status: "RequestStatus") -> Union[str, None]: + def get_finished_reason( + status: "RequestStatus") -> Union[FinishReason, None]: return _FINISHED_REASON_MAP.get(status) @@ -159,8 +159,8 @@ class RequestStatus(enum.IntEnum): # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { - RequestStatus.FINISHED_STOPPED: "stop", - RequestStatus.FINISHED_LENGTH_CAPPED: "length", - RequestStatus.FINISHED_ABORTED: "abort", - RequestStatus.FINISHED_IGNORED: "length", + RequestStatus.FINISHED_STOPPED: FinishReason.STOP, + RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH, + RequestStatus.FINISHED_ABORTED: FinishReason.ABORT, + RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH, } diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index d60f7eb5d76f9babacbb6c2685ff3ec89313b108..8e54de34548ddfe8a631d78979b91c83dffc2e9f 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from dataclasses import dataclass from typing import Dict, List, Optional, Set diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index 2796d049457d007916484add047b830a4074d3c0..ba368b44ab9cc02c8cb281049f84bfd66705081a 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List, Set, Tuple import torch diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py 
b/vllm/v1/sample/ops/topk_topp_sampler.py index f2007d85c61a545c47af29f6d3ce1bca70456d40..27431001e3e7a2f0c78baa9d6c20900bb107fe3a 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict import torch diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 9ad665a64894c589dfc280c8a133f62c4664381d..3da7498e0dae5d671e81b9fcace11ed992a8478d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A layer that samples the next tokens from the model's outputs.""" from typing import Tuple diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index b1cd5c11834f8db63eb32190e1e65c04f22299e2..1791dfa2b6325f2f41c34cd68fa86152aa9a7c06 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import pickle diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py index 902800e0573bf0aaba48ca176dcf172bce6ad62d..09d382638bffd881c9dbe3ef5ec5a55c6fb17d7d 100644 --- a/vllm/v1/stats/common.py +++ b/vllm/v1/stats/common.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import time from dataclasses import dataclass from dataclasses import field as dataclass_field diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 8dfcf2dd78606b4899ff022627fbe6222891c705..5494542c181d7843db9cbdf9051a1ad55229ae9f 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import multiprocessing import os import weakref diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 26a2084b131fa31fc8698a76332b97f459b0ff33..f520ee9586c5c909a303d9470c8f16627480e68b 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import List import numpy as np 
@@ -44,6 +46,8 @@ class BlockTable: start: int, block_ids: List[int], ) -> None: + if not block_ids: + return num_blocks = len(block_ids) self.block_table_np[row_idx, start:start + num_blocks] = block_ids self.num_blocks_per_row[row_idx] = start + num_blocks diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 28d8e390538747bdbc0239fe4134d12be86d42a7..39708f833fd58340a160eef20150c977ce17506f 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + # Datastructures defining an input batch from dataclasses import dataclass diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a00c00c307335c5004e18341deba4bf37f1eebfc..ec6d04cd497527776e8e3bcb3b0fb9677b872cb0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import gc import time from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast @@ -203,12 +205,32 @@ class GPUModelRunner: pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() - def _update_states(self, scheduler_output: "SchedulerOutput") -> None: - # Remove stopped requests from the cached states. - # Keep the states of the preempted requests. + def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: + """Update the cached states and the persistent batch with the scheduler + output. + + The updated states are used by the `_prepare_inputs` function to create + the input GPU tensors for the model. + + Returns: + True if there is a new/resumed/paused/finished request in the batch. + If False, we can skip copying SamplingMetadata to the GPU. + """ + # Remove finished requests from the cached states. 
for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) self.encoder_cache.pop(req_id, None) + # Remove the finished requests from the persistent batch. + # NOTE(woosuk): There could be an edge case where finished_req_ids and + # scheduled_req_ids overlap. This happens when a request is aborted and + # then resubmitted with the same ID. In this case, we treat them as two + # distinct requests - clearing the cached states for the first request + # and handling the second as a new request. + removed_req_indices: List[int] = [] + for req_id in scheduler_output.finished_req_ids: + req_index = self.input_batch.remove_request(req_id) + if req_index is not None: + removed_req_indices.append(req_index) # Free the cached encoder outputs. for req_id, input_id in scheduler_output.free_encoder_input_ids: @@ -218,36 +240,22 @@ class GPUModelRunner: if not encoder_outputs: self.encoder_cache.pop(req_id, None) - # Remove the requests from the persistent batch. - stopped_req_ids = set().union( - scheduler_output.preempted_req_ids, - scheduler_output.finished_req_ids, - ) - removed_req_indices: List[int] = [] - for req_id in stopped_req_ids: + # Remove the unscheduled requests from the persistent batch. + # NOTE(woosuk): The unscheduled requests are either preempted requests + # or running requests that are not scheduled in this step. We remove + # them from the persistent batch but keep their cached states since + # they will be scheduled again sometime in the future. + scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() + cached_req_ids = self.input_batch.req_id_to_index.keys() + unscheduled_req_ids = cached_req_ids - scheduled_req_ids + # NOTE(woosuk): The persistent batch optimization assumes that + # consecutive batches contain mostly the same requests. If batches + # have low request overlap (e.g., alternating between two distinct + # sets of requests), this optimization becomes very inefficient. 
+ for req_id in unscheduled_req_ids: req_index = self.input_batch.remove_request(req_id) - if req_index is not None: - removed_req_indices.append(req_index) - - # Update the states of the running requests. - for req_data in scheduler_output.scheduled_running_reqs: - req_id = req_data.req_id - req_state = self.requests[req_id] - req_index = self.input_batch.req_id_to_index[req_id] - - # Update the num_computed_tokens. - req_state.num_computed_tokens = req_data.num_computed_tokens - self.input_batch.num_computed_tokens_cpu[req_index] = ( - req_data.num_computed_tokens) - - # Update the block table. - num_new_blocks = len(req_data.new_block_ids) - if num_new_blocks == 0: - continue - start_index = len(req_state.block_ids) - req_state.block_ids.extend(req_data.new_block_ids) - self.input_batch.block_table.append_row(req_index, start_index, - req_data.new_block_ids) + assert req_index is not None + removed_req_indices.append(req_index) req_ids_to_add: List[str] = [] # Add new requests to the cached states. 
@@ -277,6 +285,7 @@ class GPUModelRunner: if self.model_config.uses_mrope: image_grid_thw = [] video_grid_thw = [] + second_per_grid_ts = [] for mm_input in self.requests[req_id].mm_inputs: if mm_input.get("image_grid_thw") is not None: image_grid_thw.extend( @@ -284,6 +293,9 @@ class GPUModelRunner: if mm_input.get("video_grid_thw") is not None: video_grid_thw.extend( mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.extend( + mm_input["second_per_grid_ts"]) hf_config = self.model_config.hf_config @@ -291,26 +303,44 @@ class GPUModelRunner: self.requests[req_id].mrope_position_delta = \ MRotaryEmbedding.get_input_positions_tensor( self.requests[req_id].prompt_token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. - spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, ) req_ids_to_add.append(req_id) - # Update the cached states of the resumed requests. - for res_req_data in scheduler_output.scheduled_resumed_reqs: - req_id = res_req_data.req_id + # Update the states of the running/resumed requests. + for req_data in scheduler_output.scheduled_cached_reqs: + req_id = req_data.req_id req_state = self.requests[req_id] - req_state.block_ids = res_req_data.block_ids - req_state.num_computed_tokens = res_req_data.num_computed_tokens - req_ids_to_add.append(req_id) + # Update the cached states. + req_state.num_computed_tokens = req_data.num_computed_tokens + if not req_data.resumed_from_preemption: + # Append the new blocks to the existing block IDs. + req_state.block_ids.extend(req_data.new_block_ids) + else: + # The request is resumed from preemption. + # Replace the existing block IDs with the new ones. 
+ req_state.block_ids = req_data.new_block_ids + + req_index = self.input_batch.req_id_to_index.get(req_id) + if req_index is None: + # The request is not in the persistent batch. + # The request was either preempted and resumed later, or was not + # scheduled in the previous step and needs to be added again. + req_ids_to_add.append(req_id) + continue + + # Update the persistent batch. + self.input_batch.num_computed_tokens_cpu[req_index] = ( + req_data.num_computed_tokens) + start_index = len(req_state.block_ids) - len( + req_data.new_block_ids) + self.input_batch.block_table.append_row(req_index, start_index, + req_data.new_block_ids) # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. @@ -328,6 +358,7 @@ class GPUModelRunner: # Condense the batched states if there are empty indices. if removed_req_indices: self.input_batch.condense(removed_req_indices) + return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0 def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens @@ -534,10 +565,10 @@ class GPUModelRunner: prefix_kv_lens=prefix_kv_lens, suffix_kv_lens=suffix_kv_lens, ) - # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial - # request in the batch. While we should not sample any token from this - # partial request, we do so for simplicity. We will ignore the sampled - # token from the partial request. + # NOTE(woosuk): Due to chunked prefills, the batch may contain partial + # requests. While we should not sample any token from these partial + # requests, we do so for simplicity. We will ignore the sampled + # tokens from the partial requests. # TODO: Support prompt logprobs. 
logits_indices = query_start_loc[1:] - 1 return attn_metadata, logits_indices @@ -599,22 +630,15 @@ class GPUModelRunner: def _prepare_sampling( self, - scheduler_output: "SchedulerOutput", + batch_changed: bool, ) -> SamplingMetadata: - skip_copy = True - if (scheduler_output.finished_req_ids - or scheduler_output.preempted_req_ids): - skip_copy = False - if (scheduler_output.scheduled_new_reqs - or scheduler_output.scheduled_resumed_reqs): - skip_copy = False # Create the sampling metadata. req_id_output_token_ids: Dict[str, List[int]] = \ {req_id: req.output_token_ids \ for req_id, req in self.requests.items()} sampling_metadata = self.input_batch.make_sampling_metadata( - req_id_output_token_ids, skip_copy) + req_id_output_token_ids, skip_copy=not batch_changed) return sampling_metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): @@ -713,7 +737,7 @@ class GPUModelRunner: self, scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: - self._update_states(scheduler_output) + batch_changed = self._update_states(scheduler_output) if self.is_multimodal_model: # Run the multimodal encoder if any. @@ -776,7 +800,7 @@ class GPUModelRunner: logits = self.model.compute_logits(hidden_states, None) # Sample the next token and get logprobs if needed. 
- sampling_metadata = self._prepare_sampling(scheduler_output) + sampling_metadata = self._prepare_sampling(batch_changed) sampler_output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a8cf0aec3f17b569444686eff81cfc7688501afa..0adb69073397c9fde95d7d6c88fc6314e9af7b28 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A GPU worker class.""" import gc import os diff --git a/vllm/version.py b/vllm/version.py index 66e189dcedf7129c08a2d5ff29367e61dc94b051..70cd0289b441f8e8dc169e1e269d6fe90f44eb27 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + try: from ._version import __version__, __version_tuple__ except Exception as e: diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 7eba1963394aa653f9f71be903ed293e139f86f5..a81fa89e35aaaf7164bda94cff2b1410d2af1f4c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,13 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 """CacheEngine class for managing the KV cache.""" from typing import List +import numpy as np import torch +from vllm import envs from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, - get_dtype_size, is_pin_memory_available) + align_to_256bytes, get_dtype_size, + is_pin_memory_available) from vllm.attention.backends.tree_decoding_utils import move_cache logger = init_logger(__name__) @@ -38,6 +43,7 @@ class CacheEngine: self.num_attention_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + 
self.align_cache = self._align_cache(model_config) self.block_size = cache_config.block_size self.num_gpu_blocks = cache_config.num_gpu_blocks @@ -75,15 +81,39 @@ class CacheEngine: num_blocks, self.block_size, self.num_kv_heads, self.head_size) pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] + + # Align entries so they are 256 byte aligned for better performance + # Primarily targets MLA as this typically only ends up having entries + # be 128 byte aligned. + if self.align_cache: + # We assume the cache shape is: + # (TOTAL_PAGES, PAGE_SIZE, entry_shape...) + # NOTE this assumption currently only holds for MLA so we only apply + # this optimization when `use_mla` is true + entry_shape = kv_cache_shape[2:] + entry_size = np.prod(entry_shape) + alloc_entry_size = align_to_256bytes(entry_size, self.dtype) + alloc_shape = (*kv_cache_shape[:2], alloc_entry_size) + else: + alloc_shape = kv_cache_shape + for _ in range(self.num_attention_layers): # null block in CpuGpuBlockAllocator requires at least that # block to be zeroed-out. # We zero-out everything for simplicity. - kv_cache.append( - torch.zeros(kv_cache_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device)) + layer_kv_cache = torch.zeros(alloc_shape, + dtype=self.dtype, + pin_memory=pin_memory, + device=device) + + # If we allocated with padding for alignment reasons truncate the + # shape while preserving the aligned stride + if self.align_cache: + layer_kv_cache = layer_kv_cache[..., :entry_size] + + # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) 
for cases + # when entry_shape is higher than 1D + kv_cache.append(layer_kv_cache.view(kv_cache_shape)) return kv_cache def swap_in(self, src_to_dst: torch.Tensor) -> None: @@ -108,6 +138,14 @@ class CacheEngine: self.num_kv_heads, self.head_size) + @staticmethod + def _align_cache(model_config: ModelConfig): + # Currently align_cache only applies to MLA models since the other + # cache kernels haven't been updated yet to support non-continguous + # tensors + return model_config.use_mla and current_platform.is_cuda() \ + and envs.VLLM_CUDA_MEM_ALIGN_KV_CACHE + @staticmethod def get_cache_block_size( cache_config: CacheConfig, @@ -119,14 +157,21 @@ class CacheEngine: num_attention_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) - key_cache_block = cache_config.block_size * num_heads * head_size - # For MLA there is no value cache, since the latent vector - # is joint keys and values. - value_cache_block = key_cache_block if not model_config.use_mla else 0 - total = num_attention_layers * (key_cache_block + value_cache_block) if cache_config.cache_dtype == "auto": dtype = model_config.dtype else: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + + key_cache_entry = num_heads * head_size + if CacheEngine._align_cache(model_config): + key_cache_entry = align_to_256bytes(key_cache_entry, + model_config.dtype) + + # For MLA there is no value cache, since the latent vector + # is joint keys and values. 
+ value_cache_entry = key_cache_entry if not model_config.use_mla else 0 + total = num_attention_layers * cache_config.block_size * \ + (key_cache_entry + value_cache_entry) + dtype_size = get_dtype_size(dtype) return dtype_size * total diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index fa6775cbd6c6683c55ed7f94a6f009046e54ae3b..71e32c5f7aca8a64941e2c783ffbb24670efe286 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 4b429b67b36f831b305c0b71dc9d504eb93e47f4..9400893105d73e5cf6683ed57231d556dab8dcf9 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import weakref from collections import defaultdict @@ -384,20 +386,17 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]): "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) hf_config = self.runner.model_config.hf_config token_ids = seq_data.get_token_ids() mrope_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. 
- spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, context_len=computed_len, ) seq_data.mrope_position_delta = mrope_position_delta diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index d31ba89e12375819ffce05470657400f8bb5cbb3..c0744d63b8d098aa06c8d8dbc811a7f13b9d22d8 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Any, Dict, List, Optional, Tuple, Type, Union diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 01ed921baa8907ece231ff75dbf0c2b9de97ec73..0014e5124865ddeda9bb6b7bc82266add8affd6d 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A CPU worker class.""" from typing import Dict, List, Optional, Set, Tuple, Type diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 8a161b740042d28b47f06306bc41f3ff3a924f62..e2d338f757616b0dc140715f90f9dbb1a183260b 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import itertools from typing import Any, Dict, List, Optional, Tuple, Type, cast diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index a339c97a8383c85b2f21c309b97fe692b1739932..b846d4387ba58a1ce3adc8c0d24bdf406a68a6a6 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index aaf9cb40bf2aaa5d878d7fe36e346f8176998f8e..a1f31bead72949d1e2e7280d4e9d95e69efa9942 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 3e8dbfc4c2e727537454bf71beb883cd11c28e6a..6a5d9f11e3cacaeb84d1d5544814bc732998e47f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import gc import inspect @@ -55,7 +57,7 @@ from vllm.worker.model_runner_base import ( _add_attn_metadata_broadcastable_dict, _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict, dump_input_when_exception) + _init_sampling_metadata_from_tensor_dict) if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -702,6 +704,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) hf_config = self.runner.model_config.hf_config inter_data.mrope_input_positions = [None] * inter_data.n_seqs @@ -713,14 +716,10 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): mrope_input_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - 
image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. - spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, context_len=inter_data.context_lens[seq_idx], seq_len=inter_data.seq_lens[seq_idx], ) @@ -1647,7 +1646,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): virtual_engine=virtual_engine) @torch.inference_mode() - @dump_input_when_exception(exclude_args=[0], exclude_kwargs=["self"]) def execute_model( self, model_input: ModelInputForGPUWithSamplingMetadata, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index aef4bdcdd4bf9a7116e025e9c02d0a87efe1ac52..38d2b712eff571cb5bd5224d83626001f237d4d8 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -1,14 +1,12 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses -import pickle from abc import ABC, abstractmethod -from datetime import datetime -from functools import wraps -from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, - Optional, Type, TypeVar) +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, + TypeVar) import torch import torch.nn as nn -from torch import is_tensor from vllm.config import VllmConfig from vllm.logger import init_logger @@ -105,59 +103,6 @@ def _init_frozen_model_input_from_tensor_dict( return tensor_dict -def dump_input_when_exception(exclude_args: Optional[List[int]] = None, - exclude_kwargs: Optional[List[str]] = None): - - def _inner(func): - - @wraps(func) - def _wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as err: - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - filename = f"/tmp/err_{func.__name__}_input_{timestamp}.pkl" - logger.info("Writing input of failed execution to %s...", - filename) - with 
open(filename, "wb") as filep: - dumped_inputs = { - k: v - for k, v in kwargs.items() - if k not in (exclude_kwargs or []) - } - for i, arg in enumerate(args): - if i not in (exclude_args or []): - dumped_inputs[f"arg_{i}"] = arg - - # Only persist dtype and shape for kvcache tensors - # (can be way to big otherwise) - if (kv_caches := dumped_inputs.get("kv_caches")) \ - and isinstance(kv_caches, Iterable): - dumped_inputs["kv_caches"] = [(t.dtype, t.shape) - for t in kv_caches - if is_tensor(t)] - - try: - pickle.dump(dumped_inputs, filep) - except Exception as pickle_err: - logger.warning( - "Failed to pickle inputs of failed execution: %s", - str(pickle_err)) - raise type(err)(f"Error in model execution: " - f"{str(err)}") from err - - logger.info( - "Completed writing input of failed execution to %s.", - filename) - raise type(err)( - f"Error in model execution (input dumped to {filename}): " - f"{str(err)}") from err - - return _wrapper - - return _inner - - class BroadcastableModelInput(ABC): @abstractmethod diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 4aab09c80826b6ed48845421d7eb3cba313ed937..90771e8ac75d84e17050c7530f39373c44a703c3 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import functools from dataclasses import dataclass, field diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py index e654f7172b26655063cf01b4eacaa1d84e5071b9..3871199987cee1ee8bf2066272edd8688b7f562a 100644 --- a/vllm/worker/multi_step_tpu_worker.py +++ b/vllm/worker/multi_step_tpu_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Dict, Optional, Tuple diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index 
1f982fe103366b150069914d9a4ad89d319f67b0..3518ab2f64fed8301d984000d43344c92afaae6c 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from dataclasses import dataclass from typing import Dict, List, Optional, Tuple diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 596c26eac28bde9167961a3148741797e745ba9b..f2093fc42ad16d26b3a194a6b39bc463efc9637a 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from dataclasses import dataclass from importlib.util import find_spec diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index f6f502cd28eb85e64b7ccb75bdfb58e54541238e..6f5822fc519413ce19c8e8032017a0231fe52257 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A Neuron worker class.""" from typing import List, Optional, Tuple diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 42fe2cf668ad8c66ea335ae2315f4aae0d38ef25..f7a5ab9de9fa68482ae5b9d7e529adca806fa4a6 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + from collections import defaultdict from typing import Dict, List, NamedTuple, Optional, Tuple @@ -52,15 +54,13 @@ class OpenVINOModelRunner(ModelRunnerBase): ): self.ov_core = ov_core ModelRunnerBase.__init__(self, vllm_config=vllm_config) - cache_config = self.cache_config - model_config = self.model_config self.is_driver_worker = is_driver_worker self.device = self.device_config.device self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size + self.sliding_window = 
self.model_config.get_sliding_window() + self.block_size = self.cache_config.block_size self.attn_backend = get_attn_backend( self.model_config.get_head_size(), @@ -79,8 +79,7 @@ class OpenVINOModelRunner(ModelRunnerBase): self.model: nn.Module # Set after init_Model def load_model(self) -> None: - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, + self.model = get_model(vllm_config=self.vllm_config, kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index f5b46cde3969ce7beee566ba72ebe8174c9feee7..0690222d91afafc2472e8358bb21aac85b766f88 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """An OpenVINO worker class.""" from typing import Any, Dict, List, Optional, Tuple diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 6de227f3cb2b90036d9b2550e72eeabce8f35f59..f43085b0e969a5b751ad2d7bf87bdceec65533fd 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses from typing import Any, Dict, List, Optional, Tuple, Type, Union diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 8749518284288e11fff730a898fd9c6abc299f35..ecdf7aa8889670957c1f369d02c439658f8ea395 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import enum import time from dataclasses import dataclass diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index ee2ce9d8a9e644c11b7312c255ee2735373215f5..20ed8ec05594e05aa2cd8467577dbb4dda1f8370 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import os from typing import List, Optional, Tuple, 
Union diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index ffa8c4cb0ff46ba0c94e23af0b14e008c7a20785..d925f088357b527d553ebbce1b3670aa179c1cd9 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 ''' Worker-related helper functions. ''' diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b8b3cc6cabf82a4954c936f02c656545d4da81a8..986c27e94e67cdefd8b6adcd38ecd77d9aa0a202 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A GPU worker class.""" import gc import os diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 0b1c03cca05b25b919d6dc7055ec6698729252e6..22a5659cb4280d0e5d52f69cc8bc638a0d05a4fa 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import os import time diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index b7b7b7227b22c874f4c71109f909c494527ac865..9c726e1a107e0737d3e30bc2223aca0bbf86e226 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import dataclasses import time import weakref diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index e9cb623c8eb450f7bc9aaac205caf92c83971c6c..047c0bbbc355503621097dbfd8a152b4e17b164b 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """A XPU worker class.""" import gc import os