Commit 4851c202 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.1' into v0.6.1-dev

parents 9b902f9e 3fd2b0d2
...@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE} ...@@ -71,13 +71,36 @@ mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
commands=$@ commands=$@
echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py"
fi
PARALLEL_JOB_COUNT=8 PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
#replace shard arguments #replace shard arguments
commands=${@//"--shard-id= "/"--shard-id=${GPU} "} commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
echo "Shard ${GPU} commands:$commands"
docker run \ docker run \
--device /dev/kfd --device /dev/dri \ --device /dev/kfd --device /dev/dri \
--network host \ --network host \
......
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
...@@ -30,6 +30,12 @@ docker exec cpu-test bash -c " ...@@ -30,6 +30,12 @@ docker exec cpu-test bash -c "
--ignore=tests/models/test_jamba.py \ --ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
# online inference # online inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_KVCACHE_SPACE=10
......
...@@ -158,6 +158,7 @@ steps: ...@@ -158,6 +158,7 @@ steps:
- python3 offline_inference_with_prefix.py - python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py - python3 llm_engine_example.py
- python3 offline_inference_vision_language.py - python3 offline_inference_vision_language.py
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py - python3 offline_inference_encoder_decoder.py
...@@ -216,7 +217,8 @@ steps: ...@@ -216,7 +217,8 @@ steps:
commands: commands:
# See https://github.com/vllm-project/vllm/issues/5152 # See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS - export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- label: LoRA Test %N # 30min each - label: LoRA Test %N # 30min each
mirror_hardwares: [amd] mirror_hardwares: [amd]
...@@ -227,6 +229,7 @@ steps: ...@@ -227,6 +229,7 @@ steps:
parallelism: 4 parallelism: 4
- label: Kernels Test %N # 30min each - label: Kernels Test %N # 30min each
mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/attention - vllm/attention
...@@ -368,6 +371,7 @@ steps: ...@@ -368,6 +371,7 @@ steps:
- label: LoRA Long Context (Distributed) # 11min - label: LoRA Long Context (Distributed) # 11min
# This test runs llama 13B, so it is required to run on 4 GPUs. # This test runs llama 13B, so it is required to run on 4 GPUs.
num_gpus: 4 num_gpus: 4
soft_fail: true
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora/test_long_context - tests/lora/test_long_context
...@@ -384,7 +388,18 @@ steps: ...@@ -384,7 +388,18 @@ steps:
- vllm/ - vllm/
- tests/weight_loading - tests/weight_loading
commands: commands:
- bash weight_loading/run_model_weight_loading_test.sh - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
##### multi gpus test ##### ##### multi gpus test #####
......
...@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*) ...@@ -39,6 +39,16 @@ FIX #xxxx (*link existing issues this PR will resolve*)
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li> <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
</ul> </ul>
<h3>Adding or changing kernels</h3>
<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
<ul>
<li>Make sure custom ops are registered following PyTorch guidelines: <a href="https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial">Custom C++ and CUDA Operators</a> and <a href="https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU">The Custom Operators Manual</a></li>
<li>Custom operations that return <code>Tensors</code> require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
<li>Use <a href="https://pytorch.org/docs/stable/library.html#torch.library.opcheck"><code>torch.libary.opcheck()</code></a> to test the function registration and meta-function for any registered ops. See <code>tests/kernels</code> for examples.</li>
<li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
<li>If a new custom type is needed, see the following document: <a href="https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA">Custom Class Support in PT2</a>.
</ul>
<h3>Notes for Large Changes</h3> <h3>Notes for Large Changes</h3>
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p> <p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
......
...@@ -205,9 +205,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -205,9 +205,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare( FetchContent_Declare(
cutlass cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1 GIT_TAG v3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
# Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
# So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
GIT_SHALLOW TRUE
) )
FetchContent_MakeAvailable(cutlass) FetchContent_MakeAvailable(cutlass)
...@@ -241,6 +245,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") ...@@ -241,6 +245,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"-gencode arch=compute_90a,code=sm_90a") "-gencode arch=compute_90a,code=sm_90a")
endif() endif()
# #
# Machete kernels # Machete kernels
...@@ -299,6 +304,12 @@ define_gpu_extension_target( ...@@ -299,6 +304,12 @@ define_gpu_extension_target(
USE_SABI 3 USE_SABI 3
WITH_SOABI) WITH_SOABI)
# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
# #
# _moe_C extension # _moe_C extension
# #
......
# vLLM Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official email address,
posting via an official social media account, or acting as an appointed
representative at an online or offline/IRL event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
version 2.1, available at
[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
For answers to common questions about this code of conduct, see the
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
...@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1 ...@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
# prepare basic build environment # prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10 ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies # Install Python and other dependencies
...@@ -37,7 +37,6 @@ WORKDIR /workspace ...@@ -37,7 +37,6 @@ WORKDIR /workspace
# install build and runtime dependencies # install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt python3 -m pip install -r requirements-cuda.txt
...@@ -66,7 +65,6 @@ COPY setup.py setup.py ...@@ -66,7 +65,6 @@ COPY setup.py setup.py
COPY cmake cmake COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt COPY requirements-common.txt requirements-common.txt
COPY requirements-adag.txt requirements-adag.txt
COPY requirements-cuda.txt requirements-cuda.txt COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml COPY pyproject.toml pyproject.toml
COPY vllm vllm COPY vllm vllm
...@@ -135,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -135,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# image with vLLM installed # image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.10 ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
...@@ -147,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -147,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \ && add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
...@@ -181,6 +180,10 @@ FROM vllm-base AS test ...@@ -181,6 +180,10 @@ FROM vllm-base AS test
ADD . /vllm-workspace/ ADD . /vllm-workspace/
# install development dependencies (for testing) # install development dependencies (for testing)
# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
# This installation must complete before the test dependencies are collected and installed.
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install "setuptools>=74.1.1"
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt python3 -m pip install -r requirements-dev.txt
......
...@@ -2,9 +2,14 @@ ...@@ -2,9 +2,14 @@
FROM ubuntu:22.04 AS cpu-test-1 FROM ubuntu:22.04 AS cpu-test-1
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
RUN --mount=type=cache,target=/var/cache/apt \ RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \ apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
...@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -25,6 +30,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade pip && \ pip install --upgrade pip && \
pip install -r requirements-build.txt pip install -r requirements-build.txt
# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
RUN --mount=type=cache,target=/root/.cache/ccache \
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
cmake --build ./oneDNN/build --target install --config Release
FROM cpu-test-1 AS build FROM cpu-test-1 AS build
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -40,7 +58,6 @@ COPY ./ ./ ...@@ -40,7 +58,6 @@ COPY ./ ./
ARG VLLM_CPU_DISABLE_AVX512 ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/ccache \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
......
...@@ -6,7 +6,9 @@ FROM $BASE_IMAGE ...@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE" RUN echo "Base image is $BASE_IMAGE"
# Install some basic utilities # Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update \
&& apt-get install python3 python3-pip -y \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1
### Mount Point ### ### Mount Point ###
# When launching the container, mount the code directory to /app # When launching the container, mount the code directory to /app
......
...@@ -4,7 +4,8 @@ ...@@ -4,7 +4,8 @@
FROM ubuntu:22.04 AS dev FROM ubuntu:22.04 AS dev
RUN apt-get update -y && \ RUN apt-get update -y && \
apt-get install -y python3-pip git apt-get install -y python3-pip git && \
apt-get install -y ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace WORKDIR /workspace
# copy requirements # copy requirements
......
...@@ -2,21 +2,26 @@ FROM mambaorg/micromamba ...@@ -2,21 +2,26 @@ FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1 ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root USER root
RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1
# Some packages in requirements-cpu are installed here # Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly # Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
# These packages will be in rocketce eventually # These packages will be in rocketce eventually
RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
WORKDIR /vllm-workspace WORKDIR /workspace/
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
...@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night ...@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
FROM $BASE_IMAGE FROM $BASE_IMAGE
WORKDIR /workspace WORKDIR /workspace
# Install some basic utilities
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
# Install the TPU and Pallas dependencies. # Install the TPU and Pallas dependencies.
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
......
...@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO ...@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
chmod 644 /usr/share/keyrings/intel-graphics.gpg chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update -y \ RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
......
include LICENSE include LICENSE
include requirements-adag.txt
include requirements-common.txt include requirements-common.txt
include requirements-cuda.txt include requirements-cuda.txt
include requirements-rocm.txt include requirements-rocm.txt
......
...@@ -82,7 +82,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install ...@@ -82,7 +82,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.0 - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.1
## Known Issue ## Known Issue
- -
......
...@@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -17,15 +17,16 @@ Easy, fast, and cheap LLM serving for everyone
--- ---
**vLLM & NVIDIA Triton User Meetup (Monday, September 9, 5pm-9pm PT) at Fort Mason, San Francisco** **vLLM, AMD, Anyscale Meet & Greet at [Ray Summit 2024](http://raysummit.anyscale.com) (Monday, Sept 30th, 5-7pm PT) at Marriott Marquis San Francisco**
We are excited to announce our sixth vLLM Meetup, in collaboration with NVIDIA Triton Team. We are excited to announce our special vLLM event in collaboration with AMD and Anyscale.
Join us to hear the vLLM's recent update about performance. Join us to learn more about recent advancements of vLLM on MI300X.
Register now [here](https://lu.ma/87q3nvnh) and be part of the event! Register [here](https://lu.ma/db5ld9n5) and be a part of the event!
--- ---
*Latest News* 🔥 *Latest News* 🔥
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
...@@ -129,4 +130,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ...@@ -129,4 +130,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
year={2023} year={2023}
} }
``` ```
\ No newline at end of file
## Contact Us
* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
\ No newline at end of file
...@@ -24,6 +24,7 @@ class RequestFuncInput: ...@@ -24,6 +24,7 @@ class RequestFuncInput:
model: str model: str
best_of: int = 1 best_of: int = 1
use_beam_search: bool = False use_beam_search: bool = False
logprobs: Optional[int] = None
@dataclass @dataclass
...@@ -236,6 +237,7 @@ async def async_request_openai_completions( ...@@ -236,6 +237,7 @@ async def async_request_openai_completions(
"temperature": 0.0, "temperature": 0.0,
"best_of": request_func_input.best_of, "best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len, "max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True, "stream": True,
} }
headers = { headers = {
......
...@@ -10,7 +10,7 @@ import torch ...@@ -10,7 +10,7 @@ import torch
from tqdm import tqdm from tqdm import tqdm
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs
from vllm.inputs import PromptInputs from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
...@@ -205,13 +205,11 @@ if __name__ == '__main__': ...@@ -205,13 +205,11 @@ if __name__ == '__main__':
default=None, default=None,
help=('path to save the pytorch profiler output. Can be visualized ' help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.')) 'with ui.perfetto.dev or Tensorboard.'))
parser.add_argument( parser.add_argument("--device",
"--device", type=str,
type=str, default="auto",
default="auto", choices=DEVICE_OPTIONS,
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], help='device type for vLLM execution')
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument('--block-size', parser.add_argument('--block-size',
type=int, type=int,
default=16, default=16,
......
...@@ -195,8 +195,16 @@ def sample_sonnet_requests( ...@@ -195,8 +195,16 @@ def sample_sonnet_requests(
def sample_random_requests( def sample_random_requests(
input_len: int, output_len: int, num_prompts: int, range_ratio: float, prefix_len: int,
tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]: input_len: int,
output_len: int,
num_prompts: int,
range_ratio: float,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
prefix_token_ids = np.random.randint(0,
tokenizer.vocab_size,
size=prefix_len).tolist()
input_lens = np.random.randint( input_lens = np.random.randint(
int(input_len * range_ratio), int(input_len * range_ratio),
...@@ -211,10 +219,12 @@ def sample_random_requests( ...@@ -211,10 +219,12 @@ def sample_random_requests(
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = [] input_requests = []
for i in range(num_prompts): for i in range(num_prompts):
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size prompt = tokenizer.decode(prefix_token_ids +
[(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])]) for j in range(input_lens[i])])
input_requests.append( input_requests.append(
(prompt, int(input_lens[i]), int(output_lens[i]))) (prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
return input_requests return input_requests
...@@ -318,6 +328,7 @@ async def benchmark( ...@@ -318,6 +328,7 @@ async def benchmark(
model_id: str, model_id: str,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]], input_requests: List[Tuple[str, int, int]],
logprobs: Optional[int],
best_of: int, best_of: int,
use_beam_search: bool, use_beam_search: bool,
request_rate: float, request_rate: float,
...@@ -339,6 +350,7 @@ async def benchmark( ...@@ -339,6 +350,7 @@ async def benchmark(
api_url=api_url, api_url=api_url,
prompt_len=test_prompt_len, prompt_len=test_prompt_len,
output_len=test_output_len, output_len=test_output_len,
logprobs=logprobs,
best_of=best_of, best_of=best_of,
use_beam_search=use_beam_search, use_beam_search=use_beam_search,
) )
...@@ -358,6 +370,7 @@ async def benchmark( ...@@ -358,6 +370,7 @@ async def benchmark(
api_url=base_url + "/start_profile", api_url=base_url + "/start_profile",
prompt_len=test_prompt_len, prompt_len=test_prompt_len,
output_len=test_output_len, output_len=test_output_len,
logprobs=logprobs,
best_of=best_of, best_of=best_of,
use_beam_search=use_beam_search, use_beam_search=use_beam_search,
) )
...@@ -379,6 +392,7 @@ async def benchmark( ...@@ -379,6 +392,7 @@ async def benchmark(
api_url=api_url, api_url=api_url,
prompt_len=prompt_len, prompt_len=prompt_len,
output_len=output_len, output_len=output_len,
logprobs=logprobs,
best_of=best_of, best_of=best_of,
use_beam_search=use_beam_search, use_beam_search=use_beam_search,
) )
...@@ -396,6 +410,7 @@ async def benchmark( ...@@ -396,6 +410,7 @@ async def benchmark(
api_url=base_url + "/stop_profile", api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len, prompt_len=test_prompt_len,
output_len=test_output_len, output_len=test_output_len,
logprobs=logprobs,
best_of=best_of, best_of=best_of,
use_beam_search=use_beam_search, use_beam_search=use_beam_search,
) )
...@@ -562,6 +577,7 @@ def main(args: argparse.Namespace): ...@@ -562,6 +577,7 @@ def main(args: argparse.Namespace):
elif args.dataset_name == "random": elif args.dataset_name == "random":
input_requests = sample_random_requests( input_requests = sample_random_requests(
prefix_len=args.random_prefix_len,
input_len=args.random_input_len, input_len=args.random_input_len,
output_len=args.random_output_len, output_len=args.random_output_len,
num_prompts=args.num_prompts, num_prompts=args.num_prompts,
...@@ -580,6 +596,7 @@ def main(args: argparse.Namespace): ...@@ -580,6 +596,7 @@ def main(args: argparse.Namespace):
model_id=model_id, model_id=model_id,
tokenizer=tokenizer, tokenizer=tokenizer,
input_requests=input_requests, input_requests=input_requests,
logprobs=args.logprobs,
best_of=args.best_of, best_of=args.best_of,
use_beam_search=args.use_beam_search, use_beam_search=args.use_beam_search,
request_rate=args.request_rate, request_rate=args.request_rate,
...@@ -721,6 +738,16 @@ if __name__ == "__main__": ...@@ -721,6 +738,16 @@ if __name__ == "__main__":
help= help=
"Number of output tokens per request, used only for sonnet dataset.", "Number of output tokens per request, used only for sonnet dataset.",
) )
parser.add_argument(
"--logprobs",
type=int,
default=None,
help=("Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"),
)
parser.add_argument( parser.add_argument(
"--sonnet-prefix-len", "--sonnet-prefix-len",
type=int, type=int,
...@@ -749,6 +776,14 @@ if __name__ == "__main__": ...@@ -749,6 +776,14 @@ if __name__ == "__main__":
help="Range of sampled ratio of input/output length, " help="Range of sampled ratio of input/output length, "
"used only for random sampling.", "used only for random sampling.",
) )
parser.add_argument(
"--random-prefix-len",
type=int,
default=0,
help="Number of fixed prefix tokens before random "
" context. The length range of context in a random "
" request is [random-prefix-len, "
" random-prefix-len + random-prefix-len * random-range-ratio).")
parser.add_argument( parser.add_argument(
"--request-rate", "--request-rate",
type=float, type=float,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment