Commit 66b809cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
import os import os
import sys import sys
import zipfile import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
# Note that we have 400 MiB quota, please use it wisely. # Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 . # See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile. # Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300)) VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
def print_top_10_largest_files(zip_file): def print_top_10_largest_files(zip_file):
......
# SPDX-License-Identifier: Apache-2.0
import argparse import argparse
import os import os
......
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.6353
- name: "exact_match,flexible-extract"
value: 0.637
limit: null
num_fewshot: null
# SPDX-License-Identifier: Apache-2.0
""" """
LM eval harness on model to compare vs HF baseline computed offline. LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml Configs are found in configs/$MODEL.yaml
......
# SPDX-License-Identifier: Apache-2.0
import json import json
import os import os
from pathlib import Path from pathlib import Path
......
# SPDX-License-Identifier: Apache-2.0
import argparse import argparse
from transformers import AutoTokenizer from transformers import AutoTokenizer
......
# SPDX-License-Identifier: Apache-2.0
import argparse import argparse
import json import json
from pathlib import Path from pathlib import Path
......
# SPDX-License-Identifier: Apache-2.0
from lmdeploy.serve.openai.api_client import APIClient from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000") api_client = APIClient("http://localhost:8000")
......
# SPDX-License-Identifier: Apache-2.0
import datetime import datetime
import json import json
import os import os
......
...@@ -23,6 +23,6 @@ trap remove_docker_container EXIT ...@@ -23,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and test offline inference # Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic.py python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
' '
...@@ -50,9 +50,9 @@ steps: ...@@ -50,9 +50,9 @@ steps:
- tests/multimodal - tests/multimodal
- tests/test_utils - tests/test_utils
- tests/worker - tests/worker
- tests/standalone_tests/lazy_torch_compile.py - tests/standalone_tests/lazy_imports.py
commands: commands:
- python3 standalone_tests/lazy_torch_compile.py - python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
...@@ -128,6 +128,7 @@ steps: ...@@ -128,6 +128,7 @@ steps:
- tests/spec_decode/e2e/test_integration_dist_tp4 - tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile - tests/compile
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
- examples/offline_inference/ray_placement.py
commands: commands:
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
...@@ -136,6 +137,7 @@ steps: ...@@ -136,6 +137,7 @@ steps:
# TODO: create a dedicated test section for multi-GPU example tests # TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests # when we have multiple distributed example tests
- python3 ../examples/offline_inference/rlhf.py - python3 ../examples/offline_inference/rlhf.py
- RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py
- label: Metrics, Tracing Test # 10min - label: Metrics, Tracing Test # 10min
num_gpus: 2 num_gpus: 2
...@@ -349,6 +351,7 @@ steps: ...@@ -349,6 +351,7 @@ steps:
- vllm/ - vllm/
- tests/models - tests/models
commands: commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py - pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py - pytest -v -s models/test_initialization.py
...@@ -485,6 +488,7 @@ steps: ...@@ -485,6 +488,7 @@ steps:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error # Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
......
...@@ -30,15 +30,6 @@ body: ...@@ -30,15 +30,6 @@ body:
</details> </details>
validations: validations:
required: true required: true
- type: textarea
attributes:
label: Model Input Dumps
description: |
If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
placeholder: |
Upload the dumped input file.
validations:
required: false
- type: textarea - type: textarea
attributes: attributes:
label: 🐛 Describe the bug label: 🐛 Describe the bug
......
...@@ -2,7 +2,6 @@ name: PR Reminder Comment Bot ...@@ -2,7 +2,6 @@ name: PR Reminder Comment Bot
on: on:
pull_request_target: pull_request_target:
types: [opened] types: [opened]
jobs: jobs:
pr_reminder: pr_reminder:
runs-on: ubuntu-latest runs-on: ubuntu-latest
...@@ -15,7 +14,12 @@ jobs: ...@@ -15,7 +14,12 @@ jobs:
owner: context.repo.owner, owner: context.repo.owner,
repo: context.repo.repo, repo: context.repo.repo,
issue_number: context.issue.number, issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
'🚀'
}) })
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
...@@ -97,10 +97,14 @@ repos: ...@@ -97,10 +97,14 @@ repos:
language: system language: system
verbose: true verbose: true
stages: [commit-msg] stages: [commit-msg]
- id: check-spdx-header
name: Check SPDX headers
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system language: system
verbose: true verbose: true
pass_filenames: false pass_filenames: false
...@@ -61,7 +61,7 @@ representative at an online or offline/IRL event. ...@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.
Instances of abusive, harassing, or otherwise unacceptable behavior may be Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). channel in the [vLLM Slack](https://slack.vllm.ai).
All complaints will be reviewed and investigated promptly and fairly. All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the All community leaders are obligated to respect the privacy and security of the
......
...@@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ ...@@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=300 ARG VLLM_MAX_SIZE_MB=400
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
......
...@@ -83,7 +83,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V ...@@ -83,7 +83,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.1; - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.2;
## Known Issue ## Known Issue
- -
......
...@@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone
</h3> </h3>
<p align="center"> <p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p> </p>
--- ---
...@@ -36,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -36,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone
## About ## About
vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry. Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
vLLM is fast with: vLLM is fast with:
...@@ -139,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ...@@ -139,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us ## Contact Us
* For technical questions and feature requests, please use Github issues or discussions. * For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord. * For discussing with fellow users and coordinating contributions and development, please use Slack.
* For coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature. * For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
......
# SPDX-License-Identifier: Apache-2.0
import json import json
import os import os
import sys import sys
......
# SPDX-License-Identifier: Apache-2.0
"""Benchmark guided decoding throughput.""" """Benchmark guided decoding throughput."""
import argparse import argparse
import dataclasses import dataclasses
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment