"lib/bindings/python/vscode:/vscode.git/clone" did not exist on "4ca1679ce173426665595f84f8648180cf2d440f"
Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
...@@ -6,11 +6,6 @@ ...@@ -6,11 +6,6 @@
[tool.ruff] [tool.ruff]
line-length = 88 line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]
......
...@@ -64,7 +64,7 @@ steps: ...@@ -64,7 +64,7 @@ steps:
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins: plugins:
- docker-login#v3.0.0: - docker-login#v3.0.0:
username: vllm username: vllmbot
password-env: DOCKERHUB_TOKEN password-env: DOCKERHUB_TOKEN
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
......
...@@ -10,15 +10,17 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu . ...@@ -10,15 +10,17 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu .
# Setup cleanup # Setup cleanup
# certain versions of HPU software stack have a bug that can # certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use # override the exit code of the script, so we need to use
# separate remove_docker_container and remove_docker_container_and_exit # separate remove_docker_containers and remove_docker_containers_and_exit
# functions, while other platforms only need one remove_docker_container # functions, while other platforms only need one remove_docker_container
# function. # function.
EXITCODE=1 EXITCODE=1
remove_docker_container() { docker rm -f hpu-test || true; } remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; } remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
trap remove_docker_container_and_exit EXIT trap remove_docker_containers_and_exit EXIT
remove_docker_container remove_docker_containers
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
EXITCODE=$? EXITCODE=$?
...@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" ...@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
HF_CACHE="$(realpath ~)/huggingface" HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}" mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}" mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
# Try building the docker image # Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
# prune old image and containers to save disk space, and only once a day # prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp. # by using a timestamp file in tmp.
...@@ -47,8 +48,16 @@ trap remove_docker_container EXIT ...@@ -47,8 +48,16 @@ trap remove_docker_container EXIT
docker run --rm -it --device=/dev/neuron0 --network bridge \ docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \ -e "HF_HOME=${HF_MOUNT}" \
-e "HF_TOKEN=${HF_TOKEN}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \ --name "${container_name}" \
${image_name} \ ${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys" /bin/bash -c "
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo 'Running test file: '$f;
python3 -m pytest \$f -v --capture=tee-sys;
done
"
\ No newline at end of file
...@@ -33,14 +33,13 @@ steps: ...@@ -33,14 +33,13 @@ steps:
- label: Documentation Build # 2min - label: Documentation Build # 2min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/test_docs/docs" working_dir: "/vllm-workspace/test_docs"
fast_check: true fast_check: true
no_gpu: True no_gpu: True
commands: commands:
- pip install -r ../../requirements/docs.txt - pip install -r ../requirements/docs.txt
- SPHINXOPTS=\"-W\" make html # TODO: add `--strict` once warnings in docstrings are fixed
# Check API reference (if it fails, you may have missing mock imports) - mkdocs build
- grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min - label: Async Engine, Inputs, Utils, Worker Test # 24min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -59,6 +58,7 @@ steps: ...@@ -59,6 +58,7 @@ steps:
- pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal - pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils - pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker - pytest -v -s worker # Worker
...@@ -125,7 +125,7 @@ steps: ...@@ -125,7 +125,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
...@@ -138,6 +138,7 @@ steps: ...@@ -138,6 +138,7 @@ steps:
- vllm/core/ - vllm/core/
- tests/distributed/test_utils - tests/distributed/test_utils
- tests/distributed/test_pynccl - tests/distributed/test_pynccl
- tests/distributed/test_events
- tests/spec_decode/e2e/test_integration_dist_tp4 - tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile/test_basic_correctness - tests/compile/test_basic_correctness
- examples/offline_inference/rlhf.py - examples/offline_inference/rlhf.py
...@@ -156,6 +157,7 @@ steps: ...@@ -156,6 +157,7 @@ steps:
- pytest -v -s distributed/test_utils.py - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: create a dedicated test section for multi-GPU example tests # TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests # when we have multiple distributed example tests
...@@ -220,6 +222,7 @@ steps: ...@@ -220,6 +222,7 @@ steps:
- pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_utils.py - pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py - pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_metrics_reader.py
# TODO: accuracy does not match, whether setting # TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100. # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e - pytest -v -s v1/e2e
...@@ -244,7 +247,7 @@ steps: ...@@ -244,7 +247,7 @@ steps:
- python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_embedding.py --seed 0 - python3 offline_inference/vision_language_embedding.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py - python3 offline_inference/encoder_decoder.py
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py - python3 offline_inference/basic/classify.py
...@@ -312,6 +315,7 @@ steps: ...@@ -312,6 +315,7 @@ steps:
- pytest -v -s compile/test_fusion.py - pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py - pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py - pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- label: PyTorch Fullgraph Smoke Test # 9min - label: PyTorch Fullgraph Smoke Test # 9min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
...@@ -386,10 +390,12 @@ steps: ...@@ -386,10 +390,12 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/model_loader - vllm/model_executor/model_loader
- tests/tensorizer_loader - tests/tensorizer_loader
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands: commands:
- apt-get update && apt-get install -y curl libsodium23 - apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader - pytest -v -s tensorizer_loader
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Benchmarks # 9min - label: Benchmarks # 9min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
...@@ -467,10 +473,7 @@ steps: ...@@ -467,10 +473,7 @@ steps:
- pytest -v -s models/test_registry.py - pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py - pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py - pytest -v -s models/test_vision.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531 - pytest -v -s models/test_initialization.py
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- label: Language Models Test (Standard) - label: Language Models Test (Standard)
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
...@@ -484,16 +487,25 @@ steps: ...@@ -484,16 +487,25 @@ steps:
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model - pytest -v -s models/language -m core_model
- label: Language Models Test (Extended) - label: Language Models Test (Extended Generation) # 1hr20min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/language - tests/models/language/generation
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language -m 'not core_model' - pytest -v -s models/language/generation -m 'not core_model'
- label: Language Models Test (Extended Pooling) # 36min
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
- vllm/
- tests/models/language/pooling
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
- label: Multi-Modal Models Test (Standard) - label: Multi-Modal Models Test (Standard)
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
/vllm/model_executor/guided_decoding @mgoin @russellb /vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96 /vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson /vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
CMakeLists.txt @tlrmchlsmth CMakeLists.txt @tlrmchlsmth
# vLLM V1 # vLLM V1
...@@ -40,3 +41,8 @@ CMakeLists.txt @tlrmchlsmth ...@@ -40,3 +41,8 @@ CMakeLists.txt @tlrmchlsmth
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb /tests/v1/structured_output @mgoin @russellb
/tests/weight_loading @mgoin @youkaichao /tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee
# Docs
/docs @hmellor
mkdocs.yaml @hmellor
\ No newline at end of file
...@@ -81,14 +81,14 @@ body: ...@@ -81,14 +81,14 @@ body:
required: true required: true
- type: markdown - type: markdown
attributes: attributes:
value: > value: |
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
Thanks for contributing 🎉! Thanks for reporting 🙏!
- type: checkboxes - type: checkboxes
id: askllm id: askllm
attributes: attributes:
......
name: 🧪 CI failure report
description: Report a failing test.
title: "[CI Failure]: "
labels: ["ci-failure"]
body:
- type: markdown
attributes:
value: >
#### Include the name of the failing Buildkite step and test file in the title.
- type: input
attributes:
label: Name of failing test
description: |
Paste in the fully-qualified name of the failing test from the logs.
placeholder: |
`path/to/test_file.py::test_name[params]`
validations:
required: true
- type: checkboxes
attributes:
label: Basic information
description: Select all items that apply to the failing test.
options:
- label: Flaky test
- label: Can reproduce locally
- label: Caused by external libraries (e.g. bug in `transformers`)
- type: textarea
attributes:
label: 🧪 Describe the failing test
description: |
Please provide a clear and concise description of the failing test.
placeholder: |
A clear and concise description of the failing test.
```
The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
```
validations:
required: true
- type: textarea
attributes:
label: 📝 History of failing test
description: |
Since when did the test start to fail?
You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
placeholder: |
Approximate timeline and/or problematic PRs
A link to the Buildkite analytics of the failing test (if available)
validations:
required: true
- type: textarea
attributes:
label: CC List.
description: >
The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
- type: markdown
attributes:
value: >
Thanks for reporting 🙏!
...@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE ...@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*) FIX #xxxx (*link existing issues this PR will resolve*)
<!--- pyml disable-next-line no-emphasis-as-heading --> <!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions) **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
...@@ -58,7 +58,7 @@ pull_request_rules: ...@@ -58,7 +58,7 @@ pull_request_rules:
- files~=^benchmarks/structured_schemas/ - files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/benchmark_serving_structured_output.py
- files=benchmarks/run_structured_output_benchmark.sh - files=benchmarks/run_structured_output_benchmark.sh
- files=docs/source/features/structured_outputs.md - files=docs/features/structured_outputs.md
- files=examples/offline_inference/structured_outputs.py - files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
...@@ -135,9 +135,7 @@ pull_request_rules: ...@@ -135,9 +135,7 @@ pull_request_rules:
- files~=^tests/entrypoints/openai/tool_parsers/ - files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/ - files~=^vllm/entrypoints/openai/tool_parsers/
- files=docs/source/features/tool_calling.md - files=docs/features/tool_calling.md
- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
- files=docs/source/getting_started/examples/chat_with_tools.md
- files~=^examples/tool_chat_* - files~=^examples/tool_chat_*
- files=examples/offline_inference/chat_with_tools.py - files=examples/offline_inference/chat_with_tools.py
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
......
...@@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" ...@@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)" # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
python3 - <<EOF python3 - <<EOF
import re import regex as re
with open("${NEW}", "r") as file: with open("${NEW}", "r") as file:
content = file.read() content = file.read()
......
...@@ -20,7 +20,12 @@ jobs: ...@@ -20,7 +20,12 @@ jobs:
with: with:
python-version: '3.12' python-version: '3.12'
- name: Install Python dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install regex
- name: Update PR description - name: Update PR description
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
...@@ -77,11 +77,6 @@ instance/ ...@@ -77,11 +77,6 @@ instance/
# Scrapy stuff: # Scrapy stuff:
.scrapy .scrapy
# Sphinx documentation
docs/_build/
docs/source/getting_started/examples/
docs/source/api/vllm
# PyBuilder # PyBuilder
.pybuilder/ .pybuilder/
target/ target/
...@@ -151,6 +146,7 @@ venv.bak/ ...@@ -151,6 +146,7 @@ venv.bak/
# mkdocs documentation # mkdocs documentation
/site /site
docs/examples
# mypy # mypy
.mypy_cache/ .mypy_cache/
......
...@@ -17,7 +17,7 @@ repos: ...@@ -17,7 +17,7 @@ repos:
- id: ruff - id: ruff
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format - id: ruff-format
files: ^(.buildkite|benchmarks)/.* files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell - repo: https://github.com/codespell-project/codespell
rev: v2.4.1 rev: v2.4.1
hooks: hooks:
...@@ -39,6 +39,7 @@ repos: ...@@ -39,6 +39,7 @@ repos:
rev: v0.9.29 rev: v0.9.29
hooks: hooks:
- id: pymarkdown - id: pymarkdown
exclude: '.*\.inc\.md'
args: [fix] args: [fix]
- repo: https://github.com/rhysd/actionlint - repo: https://github.com/rhysd/actionlint
rev: v1.7.7 rev: v1.7.7
...@@ -127,6 +128,21 @@ repos: ...@@ -127,6 +128,21 @@ repos:
name: Update Dockerfile dependency graph name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh entry: tools/update-dockerfile-graph.sh
language: script language: script
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/check_triton_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
# Keep `suggestion` last # Keep `suggestion` last
- id: suggestion - id: suggestion
name: Suggestion name: Suggestion
......
...@@ -8,12 +8,8 @@ build: ...@@ -8,12 +8,8 @@ build:
tools: tools:
python: "3.12" python: "3.12"
sphinx: mkdocs:
configuration: docs/source/conf.py configuration: mkdocs.yaml
fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []
# Optionally declare the Python requirements required to build your docs # Optionally declare the Python requirements required to build your docs
python: python:
......
...@@ -29,9 +29,6 @@ set(ignoreMe "${VLLM_PYTHON_PATH}") ...@@ -29,9 +29,6 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
# #
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures. # Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx928;gfx936") set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx928;gfx936")
...@@ -79,6 +76,15 @@ endif() ...@@ -79,6 +76,15 @@ endif()
# #
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
endif()
# #
# Forward the non-CUDA device extensions to external CMake scripts. # Forward the non-CUDA device extensions to external CMake scripts.
# #
...@@ -226,6 +232,8 @@ endif() ...@@ -226,6 +232,8 @@ endif()
# #
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/cache_kernels.cu" "csrc/cache_kernels.cu"
"csrc/attention/paged_attention_v1.cu" "csrc/attention/paged_attention_v1.cu"
"csrc/attention/paged_attention_v2.cu" "csrc/attention/paged_attention_v2.cu"
...@@ -282,8 +290,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -282,8 +290,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_MakeAvailable(cutlass) FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC list(APPEND VLLM_EXT_SRC
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu" "csrc/permute_cols.cu"
......
# Contributing to vLLM # Contributing to vLLM
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
<p align="center"> <p align="center">
<picture> <picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png"> <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
<img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%> <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
</picture> </picture>
</p> </p>
...@@ -58,7 +58,7 @@ vLLM is fast with: ...@@ -58,7 +58,7 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests - Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph - Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8. - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Speculative decoding - Speculative decoding
- Chunked prefill - Chunked prefill
...@@ -100,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. ...@@ -100,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing ## Contributing
We welcome and value any contributions and collaborations. We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved. Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
## Sponsors ## Sponsors
vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
<!-- Note: Please sort them in alphabetical order. --> <!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md --> <!-- Note: Please keep these consistent with docs/community/sponsors.md -->
Cash Donations: Cash Donations:
- a16z - a16z
- Dropbox - Dropbox
......
...@@ -146,10 +146,9 @@ python3 vllm/benchmarks/benchmark_serving.py \ ...@@ -146,10 +146,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
``` bash ``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
--speculative-model "[ngram]" \
--ngram_prompt_lookup_min 2 \ --ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \ --ngram-prompt-lookup-max 5 \
--num_speculative_tokens 5 --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
``` ```
``` bash ``` bash
...@@ -274,10 +273,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \ ...@@ -274,10 +273,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
--output-len=100 \ --output-len=100 \
--num-prompts=2048 \ --num-prompts=2048 \
--async-engine \ --async-engine \
--speculative-model="[ngram]" \
--ngram_prompt_lookup_min=2 \ --ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \ --ngram-prompt-lookup-max=5 \
--num_speculative_tokens=5 --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
``` ```
``` ```
......
...@@ -194,6 +194,11 @@ async def async_request_deepspeed_mii( ...@@ -194,6 +194,11 @@ async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput, request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)
async with aiohttp.ClientSession( async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session: ) as session:
...@@ -204,6 +209,8 @@ async def async_request_deepspeed_mii( ...@@ -204,6 +209,8 @@ async def async_request_deepspeed_mii(
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp. "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0, "top_p": 1.0,
} }
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
output = RequestFuncOutput() output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len output.prompt_len = request_func_input.prompt_len
...@@ -215,7 +222,7 @@ async def async_request_deepspeed_mii( ...@@ -215,7 +222,7 @@ async def async_request_deepspeed_mii(
st = time.perf_counter() st = time.perf_counter()
try: try:
async with session.post( async with session.post(
url=request_func_input.api_url, json=payload url=api_url, json=payload, headers=headers
) as response: ) as response:
if response.status == 200: if response.status == 200:
parsed_resp = await response.json() parsed_resp = await response.json()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment