Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
# default base image # default base image
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04" ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
FROM $BASE_IMAGE FROM $BASE_IMAGE
...@@ -22,8 +22,7 @@ WORKDIR ${APP_MOUNT}/vllm ...@@ -22,8 +22,7 @@ WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict # uninstall transformers-neuronx package explicitly to avoid version conflict
...@@ -49,6 +48,8 @@ RUN python3 -m pip install -e tests/vllm_test_utils ...@@ -49,6 +48,8 @@ RUN python3 -m pip install -e tests/vllm_test_utils
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict # FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
# overwrite entrypoint to run bash script # overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
......
...@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ...@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa" ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="5a77249" ARG AITER_BRANCH="c1debd8"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
......
...@@ -84,16 +84,40 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ ...@@ -84,16 +84,40 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
rustup default stable && \ rustup default stable && \
rustup show rustup show
FROM python-install AS torch
ARG TORCH_VERSION=2.7.0
ENV export _GLIBCXX_USE_CXX11_ABI=1
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/pytorch/pytorch.git && \
cd pytorch && \
git checkout v2.7.0 && \
git submodule sync && \
git submodule update --init --recursive && \
uv pip install cmake ninja && \
uv pip install -r requirements.txt && \
python setup.py bdist_wheel
FROM python-install AS torch-vision FROM python-install AS torch-vision
# Install torchvision # Install torchvision
ARG TORCH_VERSION=2.7.0.dev20250304 ARG TORCH_VERSION=2.7.0
ARG TORCH_VISION_VERSION=v0.20.1 ARG TORCH_VISION_VERSION=v0.20.1
WORKDIR /tmp WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
git clone https://github.com/pytorch/vision.git && \ git clone https://github.com/pytorch/vision.git && \
cd vision && \ cd vision && \
git checkout $TORCH_VISION_VERSION && \ git checkout $TORCH_VISION_VERSION && \
uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \ TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
uv pip install -v $TORCH_WHL_FILE && \
python setup.py bdist_wheel python setup.py bdist_wheel
FROM python-install AS hf-xet-builder FROM python-install AS hf-xet-builder
...@@ -138,15 +162,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -138,15 +162,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \ --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
sed -i '/^torch/d' requirements/build.txt && \ sed -i '/^torch/d' requirements/build.txt && \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \ HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
uv pip install -v \ uv pip install -v \
$ARROW_WHL_FILE \ $ARROW_WHL_FILE \
$VISION_WHL_FILE \ $VISION_WHL_FILE \
$HF_XET_WHL_FILE \ $HF_XET_WHL_FILE \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \ $TORCH_WHL_FILE \
--index-strategy unsafe-best-match \ --index-strategy unsafe-best-match \
-r requirements/build.txt \ -r requirements/build.txt \
-r requirements/cpu.txt -r requirements/cpu.txt
......
nav:
- Home:
- vLLM: README.md
- Getting Started:
- getting_started/quickstart.md
- getting_started/installation
- Examples:
- Offline Inference: examples/offline_inference
- Online Serving: examples/online_serving
- Others: examples/others
- Quick Links:
- User Guide: usage/README.md
- Developer Guide: contributing/README.md
- API Reference: api/README.md
- Timeline:
- Roadmap: https://roadmap.vllm.ai
- Releases: https://github.com/vllm-project/vllm/releases
- User Guide:
- Summary: usage/README.md
- usage/v1_guide.md
- General:
- usage/*
- Inference and Serving:
- serving/offline_inference.md
- serving/openai_compatible_server.md
- serving/*
- serving/integrations
- Deployment:
- deployment/*
- deployment/frameworks
- deployment/integrations
- Training: training
- Configuration:
- Summary: configuration/README.md
- configuration/*
- Models:
- models/supported_models.md
- models/generative_models.md
- models/pooling_models.md
- models/extensions
- Features:
- features/compatibility_matrix.md
- features/*
- features/quantization
- Developer Guide:
- Summary: contributing/README.md
- General:
- glob: contributing/*
flatten_single_child_sections: true
- Model Implementation: contributing/model
- Design Documents:
- V0: design
- V1: design/v1
- API Reference:
- Summary: api/README.md
- Contents:
- glob: api/vllm/*
preserve_directory_names: true
- Community:
- community/*
- Blog: https://blog.vllm.ai
- Forum: https://discuss.vllm.ai
- Slack: https://slack.vllm.ai
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
clean:
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
rm -rf "$(SOURCEDIR)/getting_started/examples"
rm -rf "$(SOURCEDIR)/api/vllm"
# vLLM documents # Welcome to vLLM
## Build the docs <figure markdown="span">
![](./assets/logos/vllm-logo-text-light.png){ align="center" alt="vLLM" class="no-scaled-link" width="60%" }
- Make sure in `docs` directory </figure>
```bash <p style="text-align:center">
cd docs <strong>Easy, fast, and cheap LLM serving for everyone
``` </strong>
</p>
- Install the dependencies:
<p style="text-align:center">
```bash <script async defer src="https://buttons.github.io/buttons.js"></script>
pip install -r ../requirements/docs.txt <a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
``` <a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
- Clean the previous build (optional but recommended): </p>
```bash vLLM is a fast and easy-to-use library for LLM inference and serving.
make clean
``` Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
- Generate the HTML documentation: vLLM is fast with:
```bash - State-of-the-art serving throughput
make html - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
``` - Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
## Open the docs with your browser - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Serve the documentation locally: - Speculative decoding
- Chunked prefill
```bash
python -m http.server -d build/html/ vLLM is flexible and easy to use with:
```
- Seamless integration with popular HuggingFace models
This will start a local server at http://localhost:8000. You can now open your browser and view the documentation. - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism and pipeline parallelism support for distributed inference
If port 8000 is already in use, you can specify a different port, for example: - Streaming outputs
- OpenAI-compatible API server
```bash - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
python -m http.server 3000 -d build/html/ - Prefix caching support
``` - Multi-lora support
For more information, check out the following:
- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
- [vLLM Meetups][meetups]
# Summary
[](){ #configuration }
## Configuration
API documentation for vLLM's configuration classes.
- [vllm.config.ModelConfig][]
- [vllm.config.CacheConfig][]
- [vllm.config.TokenizerPoolConfig][]
- [vllm.config.LoadConfig][]
- [vllm.config.ParallelConfig][]
- [vllm.config.SchedulerConfig][]
- [vllm.config.DeviceConfig][]
- [vllm.config.SpeculativeConfig][]
- [vllm.config.LoRAConfig][]
- [vllm.config.PromptAdapterConfig][]
- [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][]
- [vllm.config.DecodingConfig][]
- [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][]
- [vllm.config.VllmConfig][]
[](){ #offline-inference-api }
## Offline Inference
LLM Class.
- [vllm.LLM][]
LLM Inputs.
- [vllm.inputs.PromptType][]
- [vllm.inputs.TextPrompt][]
- [vllm.inputs.TokensPrompt][]
## vLLM Engines
Engine classes for offline and online inference.
- [vllm.LLMEngine][]
- [vllm.AsyncLLMEngine][]
## Inference Parameters
Inference parameters for vLLM APIs.
[](){ #sampling-params }
[](){ #pooling-params }
- [vllm.SamplingParams][]
- [vllm.PoolingParams][]
[](){ #multi-modality }
## Multi-Modality
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
Looking to add your own multi-modal model? Please follow the instructions listed [here][supports-multimodal].
- [vllm.multimodal.MULTIMODAL_REGISTRY][]
### Inputs
User-facing inputs.
- [vllm.multimodal.inputs.MultiModalDataDict][]
Internal data structures.
- [vllm.multimodal.inputs.PlaceholderRange][]
- [vllm.multimodal.inputs.NestedTensors][]
- [vllm.multimodal.inputs.MultiModalFieldElem][]
- [vllm.multimodal.inputs.MultiModalFieldConfig][]
- [vllm.multimodal.inputs.MultiModalKwargsItem][]
- [vllm.multimodal.inputs.MultiModalKwargs][]
- [vllm.multimodal.inputs.MultiModalInputs][]
### Data Parsing
- [vllm.multimodal.parse][]
### Data Processing
- [vllm.multimodal.processing][]
### Memory Profiling
- [vllm.multimodal.profiling][]
### Registry
- [vllm.multimodal.registry][]
## Model Development
- [vllm.model_executor.models.interfaces_base][]
- [vllm.model_executor.models.interfaces][]
- [vllm.model_executor.models.adapters][]
search:
boost: 0.5
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment