Commit 705f6a35 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

parents af837396 4cf256ae
...@@ -17,9 +17,23 @@ ...@@ -17,9 +17,23 @@
#pragma once #pragma once
#include "base.h" #include "base.h"
#include <cudaTypedefs.h>
namespace marlin_24 { namespace marlin_24 {
// On CUDA earlier than 12.5, the ordered_metadata version of this instruction
// is not supported. On later versions of CUDA the version without ordered
// metadata results in the following warning:
// | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction
// | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially
// | reduced performance on some future architectures
#if defined CUDA_VERSION && CUDA_VERSION >= 12050
#define MMA_SP_INST \
"mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
#else
#define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
#endif
// m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32 // m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32
// output/accumulation. // output/accumulation.
__device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
...@@ -29,41 +43,38 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1, ...@@ -29,41 +43,38 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1); const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1);
const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b); const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m); const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m);
float* c = reinterpret_cast<float*>(&frag_c); float* c = reinterpret_cast<float*>(&frag_c);
if (psel == 0) { if (psel == 0) {
asm volatile( asm volatile(MMA_SP_INST
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x0;\n"
"{%12,%13,%14,%15}, %16, 0x0;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]), "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
"r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "f"(c[2]), "f"(c[3]), "r"(e[0]));
"r"(e[0])); asm volatile(MMA_SP_INST
asm volatile( "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%12,%13,%14,%15}, %16, 0x0;\n"
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
"{%12,%13,%14,%15}, %16, 0x0;\n" : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]), "f"(c[6]), "f"(c[7]), "r"(e[0]));
"r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]),
"r"(e[0]));
} else { } else {
asm volatile( asm volatile(MMA_SP_INST
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " "{%12,%13,%14,%15}, %16, 0x1;\n"
"{%12,%13,%14,%15}, %16, 0x1;\n" : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]), "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
"r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]), "f"(c[2]), "f"(c[3]), "r"(e[0]));
"r"(e[0])); asm volatile(MMA_SP_INST
asm volatile( "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
"mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 " "{%12,%13,%14,%15}, %16, 0x1;\n"
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, " : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
"{%12,%13,%14,%15}, %16, 0x1;\n" : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
: "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7]) "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
: "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]), "f"(c[6]), "f"(c[7]), "r"(e[0]));
"r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]),
"r"(e[0]));
} }
} }
......
...@@ -68,6 +68,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -68,6 +68,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
ops.impl("gelu_fast", torch::kCUDA, &gelu_fast); ops.impl("gelu_fast", torch::kCUDA, &gelu_fast);
// Quick GELU implementation.
ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
// Layernorm // Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor. // Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def( ops.def(
...@@ -133,13 +137,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -133,13 +137,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("gptq_marlin_repack", &gptq_marlin_repack); ops.def("gptq_marlin_repack", &gptq_marlin_repack);
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack); ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization. // quantization.
ops.def( ops.def(
"cutlass_scaled_mm_dq(Tensor! out, Tensor a," "cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales," " Tensor b, Tensor a_scales,"
" Tensor b_scales) -> ()"); " Tensor b_scales, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm_dq", torch::kCUDA, &cutlass_scaled_mm_dq); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
// Check if cutlass scaled_mm is supported for CUDA devices of the given
// capability
ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
&cutlass_scaled_mm_supports_fp8);
#endif #endif
// Quantized GEMM for GPTQ. // Quantized GEMM for GPTQ.
......
sphinx == 6.2.1 sphinx==6.2.1
sphinx-book-theme == 1.0.1 sphinx-book-theme==1.0.1
sphinx-copybutton == 0.5.2 sphinx-copybutton==0.5.2
myst-parser == 2.0.0 myst-parser==2.0.0
sphinx-argparse sphinx-argparse
# packages to install to build the documentation # packages to install to build the documentation
......
<style>
.notification-bar {
width: 100vw;
display: flex;
justify-content: center;
align-items: center;
font-size: 16px;
padding: 0 6px 0 6px;
}
.notification-bar p {
margin: 0;
}
.notification-bar a {
font-weight: bold;
text-decoration: none;
}
/* Light mode styles (default) */
.notification-bar {
background-color: #fff3cd;
color: #856404;
}
.notification-bar a {
color: #d97706;
}
/* Dark mode styles */
html[data-theme=dark] .notification-bar {
background-color: #333;
color: #ddd;
}
html[data-theme=dark] .notification-bar a {
color: #ffa500; /* Brighter color for visibility */
}
</style>
<div class="notification-bar">
<p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
</div>
...@@ -5,6 +5,7 @@ vLLM Meetups ...@@ -5,6 +5,7 @@ vLLM Meetups
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__ - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
- `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__ - `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__
- `The first vLLM meetup <https://lu.ma/first-vllm-meetup>`__, with a16z, October 5th 2023. `[Slides] <https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing>`__ - `The first vLLM meetup <https://lu.ma/first-vllm-meetup>`__, with a16z, October 5th 2023. `[Slides] <https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing>`__
......
...@@ -22,5 +22,6 @@ vLLM is a community project. Our compute resources for development and testing a ...@@ -22,5 +22,6 @@ vLLM is a community project. Our compute resources for development and testing a
- Trainy - Trainy
- UC Berkeley - UC Berkeley
- UC San Diego - UC San Diego
- ZhenFund
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
...@@ -66,8 +66,20 @@ html_theme_options = { ...@@ -66,8 +66,20 @@ html_theme_options = {
'path_to_docs': 'docs/source', 'path_to_docs': 'docs/source',
'repository_url': 'https://github.com/vllm-project/vllm', 'repository_url': 'https://github.com/vllm-project/vllm',
'use_repository_button': True, 'use_repository_button': True,
'use_edit_page_button': True,
} }
# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
if READTHEDOCS_VERSION_TYPE == "tag":
# remove the warning banner if the version is a tagged release
header_file = os.path.join(os.path.dirname(__file__),
"_templates/sections/header.html")
# The file might be removed already if the build is triggered multiple times
# (readthedocs build both HTML and PDF versions separately)
if os.path.exists(header_file):
os.remove(header_file)
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
...@@ -95,6 +107,7 @@ autodoc_mock_imports = [ ...@@ -95,6 +107,7 @@ autodoc_mock_imports = [
'triton', 'triton',
"tqdm", "tqdm",
"tensorizer", "tensorizer",
"pynvml",
] ]
for mock_target in autodoc_mock_imports: for mock_target in autodoc_mock_imports:
......
Dockerfile Dockerfile
==================== ====================
See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`__ for the main Dockerfile to construct
the image for running an OpenAI compatible server with vLLM. the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here <https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html>`__.
- Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
- All build stages - All build stages
- The default build target (highlighted in grey) - The default build target (highlighted in grey)
- External images (with dashed borders) - External images (with dashed borders)
The edges of the build graph represent: The edges of the build graph represent:
- FROM ... dependencies (with a solid line and a full arrow head) - FROM ... dependencies (with a solid line and a full arrow head)
- COPY --from=... dependencies (with a dashed line and an empty arrow head) - COPY --from=... dependencies (with a dashed line and an empty arrow head)
- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
.. figure:: ../../assets/dev/dockerfile-stages-dependency.png .. figure:: ../../assets/dev/dockerfile-stages-dependency.png
:alt: query :alt: query
......
.. _input_processing_pipeline:
Input Processing Pipeline
=========================
1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
2. Tokenize the data if necessary.
3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
- For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
- For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model.
.. _input_processing:
Input Processing
================
.. currentmodule:: vllm.inputs
Each model can override parts of vLLM's :ref:`input processing pipeline <input_processing_pipeline>` via
:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
Currently, this mechanism is only utilized in :ref:`multi-modal <multi_modality>` models for preprocessing multi-modal input
data in addition to input prompt, but it can be extended to text-only language models when needed.
Guides
++++++
.. toctree::
:maxdepth: 1
input_processing_pipeline
Module Contents
+++++++++++++++
LLM Engine Inputs
-----------------
.. autoclass:: vllm.inputs.LLMInputs
:members:
:show-inheritance:
Registry
--------
.. autodata:: vllm.inputs.INPUT_REGISTRY
.. automodule:: vllm.inputs.registry
:members:
:show-inheritance:
.. _adding_multimodal_plugin:
Adding a Multimodal Plugin
==========================
This document teaches you how to add a new modality to vLLM.
Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.
The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.
.. note::
This article is a work in progress.
..
TODO: Add more instructions on how to add new plugins once embeddings is in.
.. _multi_modality:
Multi-Modality Multi-Modality
============== ==============
...@@ -5,16 +7,21 @@ Multi-Modality ...@@ -5,16 +7,21 @@ Multi-Modality
vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data`` Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
which allows you to pass in multi-modal input alongside text and token prompts. via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.
Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
by following :ref:`this guide <adding_multimodal_plugin>`.
Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.
By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, Guides
you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`, ++++++
as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
.. contents:: .. toctree::
:local: :maxdepth: 1
:backlinks: none
adding_multimodal_plugin
Module Contents Module Contents
+++++++++++++++ +++++++++++++++
...@@ -24,9 +31,7 @@ Module Contents ...@@ -24,9 +31,7 @@ Module Contents
Registry Registry
-------- --------
.. data:: vllm.multimodal.MULTIMODAL_REGISTRY .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
The global :class:`MultiModalRegistry` which is used by model runners.
.. autoclass:: vllm.multimodal.MultiModalRegistry .. autoclass:: vllm.multimodal.MultiModalRegistry
:members: :members:
...@@ -35,7 +40,15 @@ Registry ...@@ -35,7 +40,15 @@ Registry
Base Classes Base Classes
------------ ------------
.. autoclass:: vllm.multimodal.MultiModalData .. autodata:: vllm.multimodal.BatchedTensors
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
:members:
:show-inheritance:
.. autodata:: vllm.multimodal.MultiModalDataDict
.. autoclass:: vllm.multimodal.MultiModalInputs
:members: :members:
:show-inheritance: :show-inheritance:
......
...@@ -88,7 +88,7 @@ Option 2: Build from source ...@@ -88,7 +88,7 @@ Option 2: Build from source
- `Pytorch <https://pytorch.org/>`_ - `Pytorch <https://pytorch.org/>`_
- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_ - `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
For installing PyTorch, you can start from a fresh docker image, e.g, `rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`. For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started <https://pytorch.org/get-started/locally/>`_ Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started <https://pytorch.org/get-started/locally/>`_
...@@ -126,12 +126,12 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl ...@@ -126,12 +126,12 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl
$ cd vllm $ cd vllm
$ pip install -U -r requirements-rocm.txt $ pip install -U -r requirements-rocm.txt
$ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
.. tip:: .. tip::
- You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation. - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation.
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
- To use CK flash-attention, please use this flag ``export VLLM_USE_FLASH_ATTN_TRITON=0`` to turn off triton flash attention. - To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention.
- The ROCm version of pytorch, ideally, should match the ROCm driver version. - The ROCm version of pytorch, ideally, should match the ROCm driver version.
...@@ -10,6 +10,7 @@ Table of contents: ...@@ -10,6 +10,7 @@ Table of contents:
#. :ref:`Requirements <cpu_backend_requirements>` #. :ref:`Requirements <cpu_backend_requirements>`
#. :ref:`Quick start using Dockerfile <cpu_backend_quick_start_dockerfile>` #. :ref:`Quick start using Dockerfile <cpu_backend_quick_start_dockerfile>`
#. :ref:`Build from source <build_cpu_backend_from_source>` #. :ref:`Build from source <build_cpu_backend_from_source>`
#. :ref:`Intel Extension for PyTorch <ipex_guidance>`
#. :ref:`Performance tips <cpu_backend_performance_tips>` #. :ref:`Performance tips <cpu_backend_performance_tips>`
.. _cpu_backend_requirements: .. _cpu_backend_requirements:
...@@ -18,8 +19,8 @@ Requirements ...@@ -18,8 +19,8 @@ Requirements
------------ ------------
* OS: Linux * OS: Linux
* Compiler: gcc/g++>=12.3.0 (recommended) * Compiler: gcc/g++>=12.3.0 (optional, recommended)
* Instruction set architecture (ISA) requirement: AVX512 is required. * Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
.. _cpu_backend_quick_start_dockerfile: .. _cpu_backend_quick_start_dockerfile:
...@@ -41,7 +42,7 @@ Quick start using Dockerfile ...@@ -41,7 +42,7 @@ Quick start using Dockerfile
Build from source Build from source
----------------- -----------------
- First, install required compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: - First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
.. code-block:: console .. code-block:: console
...@@ -70,6 +71,15 @@ Build from source ...@@ -70,6 +71,15 @@ Build from source
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
.. _ipex_guidance:
Intel Extension for PyTorch
---------------------------
- `Intel Extension for PyTorch (IPEX) <https://github.com/intel/intel-extension-for-pytorch>`_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed.
.. _cpu_backend_performance_tips: .. _cpu_backend_performance_tips:
Performance tips Performance tips
...@@ -77,6 +87,15 @@ Performance tips ...@@ -77,6 +87,15 @@ Performance tips
- vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run:
.. code-block:: console
$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
$ find / -name *libtcmalloc* # find the dynamic link library path
$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
$ python examples/offline_inference.py # run vLLM
- vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription. - vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription.
- If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading. - If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading.
......
...@@ -8,35 +8,62 @@ Debugging hang/crash issues ...@@ -8,35 +8,62 @@ Debugging hang/crash issues
When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time: When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time:
- Downloading a model: do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue. - **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
- Loading the model from disk: if the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory. - **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
- Tensor parallel inference: if the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. - **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
If you already take care of the above issues, and the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue: If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
- Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. - Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
- Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble. - Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble.
- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. - Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
- Set the environment variable ``export VLLM_TRACE_FUNCTION=1`` . All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. **Note: it will generate a lot of logs and slow down the system. Only use it for debugging purposes.** - Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs.
.. warning::
vLLM function tracing will generate a lot of logs and slow down the system. Only use it for debugging purposes.
With more logging, hopefully you can find the root cause of the issue. With more logging, hopefully you can find the root cause of the issue.
If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
Here are some common issues that can cause hangs: Here are some common issues that can cause hangs:
- The network setup is incorrect. The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. - **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address.
- Hardware/driver setup is incorrect. GPU communication cannot be established. You can run a sanity check script below to see if the GPU communication is working correctly. - **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly.
.. code-block:: python .. code-block:: python
# save it as `test.py`` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py`
# adjust `--nproc-per-node` to the number of GPUs you want to use.
import torch import torch
import torch.distributed as dist import torch.distributed as dist
dist.init_process_group(backend="nccl") dist.init_process_group(backend="nccl")
data = torch.FloatTensor([1,] * 128).to(f"cuda:{dist.get_rank()}") local_rank = dist.get_rank() % torch.cuda.device_count()
data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}")
dist.all_reduce(data, op=dist.ReduceOp.SUM) dist.all_reduce(data, op=dist.ReduceOp.SUM)
torch.cuda.synchronize() torch.cuda.synchronize()
value = data.mean().item() value = data.mean().item()
assert value == dist.get_world_size() world_size = dist.get_world_size()
assert value == world_size, f"Expected {world_size}, got {value}"
gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
cpu_data = torch.FloatTensor([1,] * 128)
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
value = cpu_data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
print("sanity check is successful!")
.. tip::
Save the script as ``test.py``.
If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use.
If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``:
- is the correct IP address of the master node
- is reachable from all nodes
- is set before running the script.
If the script runs successfully, you should see the message ``sanity check is successful!``.
If the problem persists, feel free to open an `issue <https://github.com/vllm-project/vllm/issues/new/choose>`_ on GitHub, with a detailed description of the issue, your environment, and the logs. If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
...@@ -20,7 +20,7 @@ You can install vLLM using pip: ...@@ -20,7 +20,7 @@ You can install vLLM using pip:
.. code-block:: console .. code-block:: console
$ # (Recommended) Create a new conda environment. $ # (Recommended) Create a new conda environment.
$ conda create -n myenv python=3.9 -y $ conda create -n myenv python=3.10 -y
$ conda activate myenv $ conda activate myenv
$ # Install vLLM with CUDA 12.1. $ # Install vLLM with CUDA 12.1.
...@@ -35,13 +35,27 @@ You can install vLLM using pip: ...@@ -35,13 +35,27 @@ You can install vLLM using pip:
$ # Install vLLM with CUDA 11.8. $ # Install vLLM with CUDA 11.8.
$ export VLLM_VERSION=0.4.0 $ export VLLM_VERSION=0.4.0
$ export PYTHON_VERSION=39 $ export PYTHON_VERSION=310
$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
.. note::
vLLM also publishes a subset of wheels (Python 3.10, 3.11 with CUDA 12) for every commit since v0.5.3. You can download them with the following command:
.. code-block:: console
$ export VLLM_VERSION=0.5.2 # vLLM's main branch version is currently set to latest released tag
$ export PYTHON_VERSION=310
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
$ # You can also access a specific commit
$ # export VLLM_COMMIT=...
$ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
.. _build_from_source: .. _build_from_source:
Build from source Build from source
......
.. _installation_openvino:
Installation with OpenVINO
==========================
vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support. OpenVINO vLLM backend supports the following advanced vLLM features:
- Prefix caching (``--enable-prefix-caching``)
- Chunked prefill (``--enable-chunked-prefill``)
**Table of contents**:
- :ref:`Requirements <openvino_backend_requirements>`
- :ref:`Quick start using Dockerfile <openvino_backend_quick_start_dockerfile>`
- :ref:`Build from source <install_openvino_backend_from_source>`
- :ref:`Performance tips <openvino_backend_performance_tips>`
- :ref:`Limitations <openvino_backend_limitations>`
.. _openvino_backend_requirements:
Requirements
------------
* OS: Linux
* Instruction set architecture (ISA) requirement: at least AVX2.
.. _openvino_backend_quick_start_dockerfile:
Quick start using Dockerfile
----------------------------
.. code-block:: console
$ docker build -f Dockerfile.openvino -t vllm-openvino-env .
$ docker run -it --rm vllm-openvino-env
.. _install_openvino_backend_from_source:
Install from source
-------------------
- First, install Python. For example, on Ubuntu 22.04, you can run:
.. code-block:: console
$ sudo apt-get update -y
$ sudo apt-get install python3
- Second, install prerequisites vLLM OpenVINO backend installation:
.. code-block:: console
$ pip install --upgrade pip
$ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
- Finally, install vLLM with OpenVINO backend:
.. code-block:: console
$ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
.. _openvino_backend_performance_tips:
Performance tips
----------------
vLLM OpenVINO backend uses the following environment variables to control behavior:
- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off.
To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``)
OpenVINO best known configuration is:
.. code-block:: console
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
.. _openvino_backend_limitations:
Limitations
-----------
- LoRA serving is not supported.
- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
- Speculative sampling is not tested within vLLM integration.
.. _installation_tpu:
Installation with TPU
=====================
vLLM supports Google Cloud TPUs using PyTorch XLA.
Requirements
------------
* Google Cloud TPU VM (single host)
* TPU versions: v5e, v5p, v4
* Python: 3.10
Installation options:
1. :ref:`Build a docker image with Dockerfile <build_docker_tpu>`.
2. :ref:`Build from source <build_from_source_tpu>`.
.. _build_docker_tpu:
Build a docker image with :code:`Dockerfile.tpu`
------------------------------------------------
`Dockerfile.tpu <https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu>`_ is provided to build a docker image with TPU support.
.. code-block:: console
$ docker build -f Dockerfile.tpu -t vllm-tpu .
You can run the docker image with the following command:
.. code-block:: console
$ # Make sure to add `--privileged --net host --shm-size=16G`.
$ docker run --privileged --net host --shm-size=16G -it vllm-tpu
.. _build_from_source_tpu:
Build from source
-----------------
You can also build and install the TPU backend from source.
First, install the dependencies:
.. code-block:: console
$ # (Recommended) Create a new conda environment.
$ conda create -n myenv python=3.10 -y
$ conda activate myenv
$ # Clean up the existing torch and torch-xla packages.
$ pip uninstall torch torch-xla -y
$ # Install PyTorch and PyTorch XLA.
$ export DATE="+20240601"
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl
$ # Install JAX and Pallas.
$ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
$ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
$ # Install other build dependencies.
$ pip install packaging aiohttp
Next, build vLLM from source. This will only take a few seconds:
.. code-block:: console
$ VLLM_TARGET_DEVICE="tpu" python setup.py develop
.. tip::
If you encounter the following error:
.. code-block:: console
from torch._C import * # noqa: F403
ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory
You can install OpenBLAS with the following command:
.. code-block:: console
$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
.. _installation_xpu:
Installation with XPU
========================
vLLM initially supports basic model inferencing and serving on Intel GPU platform.
Table of contents:
#. :ref:`Requirements <xpu_backend_requirements>`
#. :ref:`Quick start using Dockerfile <xpu_backend_quick_start_dockerfile>`
#. :ref:`Build from source <build_xpu_backend_from_source>`
.. _xpu_backend_requirements:
Requirements
------------
* OS: Linux
* Supported Hardware: Intel Data Center GPU (Intel ARC GPU WIP)
* OneAPI requirements: oneAPI 2024.1
.. _xpu_backend_quick_start_dockerfile:
Quick start using Dockerfile
----------------------------
.. code-block:: console
$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
$ docker run -it \
--rm \
--network=host \
--device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \
vllm-xpu-env
.. _build_xpu_backend_from_source:
Build from source
-----------------
- First, install required driver and intel OneAPI 2024.1 or later.
- Second, install Python packages for vLLM XPU backend building:
.. code-block:: console
$ source /opt/intel/oneapi/setvars.sh
$ pip install --upgrade pip
$ pip install -v -r requirements-xpu.txt
- Finally, build and install vLLM XPU backend:
.. code-block:: console
$ VLLM_TARGET_DEVICE=xpu python setup.py install
.. note::
- FP16 is the default data type in the current XPU backend. The BF16 data
type will be supported in the future.
...@@ -38,7 +38,7 @@ vLLM is flexible and easy to use with: ...@@ -38,7 +38,7 @@ vLLM is flexible and easy to use with:
* Seamless integration with popular HuggingFace models * Seamless integration with popular HuggingFace models
* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
* Tensor parallelism support for distributed inference * Tensor parallelism and pipeline parallelism support for distributed inference
* Streaming outputs * Streaming outputs
* OpenAI-compatible API server * OpenAI-compatible API server
* Support NVIDIA GPUs and AMD GPUs * Support NVIDIA GPUs and AMD GPUs
...@@ -63,8 +63,11 @@ Documentation ...@@ -63,8 +63,11 @@ Documentation
getting_started/installation getting_started/installation
getting_started/amd-installation getting_started/amd-installation
getting_started/neuron-installation getting_started/openvino-installation
getting_started/cpu-installation getting_started/cpu-installation
getting_started/neuron-installation
getting_started/tpu-installation
getting_started/xpu-installation
getting_started/quickstart getting_started/quickstart
getting_started/debugging getting_started/debugging
getting_started/examples/examples_index getting_started/examples/examples_index
...@@ -80,6 +83,8 @@ Documentation ...@@ -80,6 +83,8 @@ Documentation
serving/env_vars serving/env_vars
serving/usage_stats serving/usage_stats
serving/integrations serving/integrations
serving/tensorizer
serving/faq
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
...@@ -87,6 +92,7 @@ Documentation ...@@ -87,6 +92,7 @@ Documentation
models/supported_models models/supported_models
models/adding_model models/adding_model
models/enabling_multimodal_inputs
models/engine_args models/engine_args
models/lora models/lora
models/vlm models/vlm
...@@ -97,6 +103,7 @@ Documentation ...@@ -97,6 +103,7 @@ Documentation
:maxdepth: 1 :maxdepth: 1
:caption: Quantization :caption: Quantization
quantization/supported_hardware
quantization/auto_awq quantization/auto_awq
quantization/fp8 quantization/fp8
quantization/fp8_e5m2_kvcache quantization/fp8_e5m2_kvcache
...@@ -110,12 +117,14 @@ Documentation ...@@ -110,12 +117,14 @@ Documentation
automatic_prefix_caching/details automatic_prefix_caching/details
.. toctree:: .. toctree::
:maxdepth: 2
:caption: Developer Documentation :caption: Developer Documentation
dev/sampling_params dev/sampling_params
dev/offline_inference/offline_index dev/offline_inference/offline_index
dev/engine/engine_index dev/engine/engine_index
dev/kernel/paged_attention dev/kernel/paged_attention
dev/input_processing/model_inputs_index
dev/multimodal/multimodal_index dev/multimodal/multimodal_index
dev/dockerfile/dockerfile dev/dockerfile/dockerfile
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment