Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import sys
 import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
 def print_top_10_largest_files(zip_file):

--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
+# SPDX-License-Identifier: Apache-2.0
 import argparse
 import os

--- a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
+model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.6353
+  - name: "exact_match,flexible-extract"
+    value: 0.637
+limit: null
+num_fewshot: null 
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 import os
 from pathlib import Path

--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+# SPDX-License-Identifier: Apache-2.0
 import argparse
 from transformers import AutoTokenizer

--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+# SPDX-License-Identifier: Apache-2.0
 import argparse
 import json
 from pathlib import Path

--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+# SPDX-License-Identifier: Apache-2.0
 from lmdeploy.serve.openai.api_client import APIClient
 api_client = APIClient("http://localhost:8000")

--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+# SPDX-License-Identifier: Apache-2.0
 import datetime
 import json
 import os

--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -23,6 +23,6 @@ trap remove_docker_container EXIT
 remove_docker_container
 # Run the image and test offline inference
-docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic.py
+    python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -50,9 +50,9 @@ steps:
  - tests/multimodal
  - tests/test_utils
  - tests/worker
-  - tests/standalone_tests/lazy_torch_compile.py
+  - tests/standalone_tests/lazy_imports.py
  commands:
-  - python3 standalone_tests/lazy_torch_compile.py
+  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@@ -128,6 +128,7 @@ steps:
  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile
  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/ray_placement.py
  commands:
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
@@ -136,6 +137,7 @@ steps:
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - python3 ../examples/offline_inference/rlhf.py
+  - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py
 - label: Metrics, Tracing Test # 10min
  num_gpus: 2 
@@ -349,6 +351,7 @@ steps:
  - vllm/
  - tests/models
  commands:
+    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
    - pytest -v -s models/test_initialization.py
@@ -485,6 +488,7 @@ steps:
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'

--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -30,15 +30,6 @@ body:
      </details>
  validations:
    required: true
- type: textarea
-  attributes:
-    label: Model Input Dumps
-    description: |
-      If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
-    placeholder: |
-      Upload the dumped input file.
-  validations:
-    required: false
 - type: textarea
  attributes:
    label: 🐛 Describe the bug

--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -2,7 +2,6 @@ name: PR Reminder Comment Bot
 on:
  pull_request_target:
    types: [opened]
 jobs:
  pr_reminder:
    runs-on: ubuntu-latest
@@ -15,7 +14,12 @@ jobs:
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
-              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
+                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
+                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
+                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
+                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
+                '🚀'
            })
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -97,10 +97,14 @@ repos:
    language: system
    verbose: true
    stages: [commit-msg]
+  - id: check-spdx-header
+    name: Check SPDX headers
+    entry: python tools/check_spdx_header.py
+    language: python
+    types: [python]
  - id: suggestion
    name: Suggestion
    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
    language: system
    verbose: true
    pass_filenames: false
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement in the #code-of-conduct
-channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
+channel in the [vLLM Slack](https://slack.vllm.ai).
 All complaints will be reviewed and investigated promptly and fairly.
 All community leaders are obligated to respect the privacy and security of the

--- a/Dockerfile
+++ b/Dockerfile
@@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # sync the default value with .buildkite/check-wheel-size.py
-ARG VLLM_MAX_SIZE_MB=300
+ARG VLLM_MAX_SIZE_MB=400
 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 ARG RUN_WHEEL_CHECK=true
 RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \

--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install （若调试，可使用V
 + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.1;
+- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.2;
 ## Known Issue
 - 无

--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 ---
@@ -36,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone
 ## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.
-Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
 vLLM is fast with:
@@ -139,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us
 * For technical questions and feature requests, please use Github issues or discussions.
-* For discussing with fellow users, please use Discord.
+* For discussing with fellow users and coordinating contributions and development, please use Slack.
-* For coordinating contributions and development, please use Slack.
 * For security disclosures, please use Github's security advisory feature.
 * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.

--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 import os
 import sys

--- a/benchmarks/benchmark_guided.py
+++ b/benchmarks/benchmark_guided.py
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark guided decoding throughput."""
 import argparse
 import dataclasses