Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

705f6a35 · zhuwenwen · af837396 · 4cf256ae · 705f6a35 · 705f6a35
Commit 705f6a35 authored Jul 16, 2024 by zhuwenwen
20 changed files
--- a/csrc/quantization/marlin/sparse/common/mma.h
+++ b/csrc/quantization/marlin/sparse/common/mma.h
@@ -17,9 +17,23 @@
 #pragma once
 #include "base.h"
+#include <cudaTypedefs.h>
 namespace marlin_24 {
+// On CUDA earlier than 12.5, the ordered_metadata version of this instruction
+// is not supported. On later versions of CUDA the version without ordered
+// metadata results in the following warning:
+//  | Advisory: Modifier ‘.sp::ordered_metadata’ should be used on instruction
+//  | ‘mma’ instead of modifier ‘.sp’ as it is expected to have substantially
+//  | reduced performance on some future architectures
+#if defined CUDA_VERSION && CUDA_VERSION >= 12050
+  #define MMA_SP_INST \
+    "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+#else
+  #define MMA_SP_INST "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+#endif
 // m16n8k32 sparse tensor core mma instruction with fp16 inputs and fp32
 // output/accumulation.
 __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
@@ -29,41 +43,38 @@ __device__ inline void mma_sp(const FragB& a_frag0, const FragB& a_frag1,
  const uint32_t* a1 = reinterpret_cast<const uint32_t*>(&a_frag1);
  const uint32_t* b = reinterpret_cast<const uint32_t*>(&frag_b);
  const uint32_t* e = reinterpret_cast<const uint32_t*>(&frag_m);
  float* c = reinterpret_cast<float*>(&frag_c);
  if (psel == 0) {
-    asm volatile(
+    asm volatile(MMA_SP_INST
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
-        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x0;\n"
-        "{%12,%13,%14,%15}, %16, 0x0;\n"
+                 : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
-        : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]),
+                   "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
-          "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
+                   "f"(c[2]), "f"(c[3]), "r"(e[0]));
-          "r"(e[0]));
+    asm volatile(MMA_SP_INST
-    asm volatile(
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+                 "{%12,%13,%14,%15}, %16, 0x0;\n"
-        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
-        "{%12,%13,%14,%15}, %16, 0x0;\n"
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
-        : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
+                   "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
-        : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]),
+                   "f"(c[6]), "f"(c[7]), "r"(e[0]));
-          "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]),
-          "r"(e[0]));
  } else {
-    asm volatile(
+    asm volatile(MMA_SP_INST
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
-        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 "{%12,%13,%14,%15}, %16, 0x1;\n"
-        "{%12,%13,%14,%15}, %16, 0x1;\n"
+                 : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
-        : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]),
-        : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[0]), "r"(b[2]),
+                   "r"(b[2]), "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]),
-          "r"(b[4]), "r"(b[6]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]),
+                   "f"(c[2]), "f"(c[3]), "r"(e[0]));
-          "r"(e[0]));
+    asm volatile(MMA_SP_INST
-    asm volatile(
+                 "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
-        "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 "
+                 "{%12,%13,%14,%15}, %16, 0x1;\n"
-        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9, %10,%11}, "
+                 : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
-        "{%12,%13,%14,%15}, %16, 0x1;\n"
+                 : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]),
-        : "=f"(c[4]), "=f"(c[5]), "=f"(c[6]), "=f"(c[7])
+                   "r"(b[3]), "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]),
-        : "r"(a0[0]), "r"(a1[0]), "r"(a0[1]), "r"(a1[1]), "r"(b[1]), "r"(b[3]),
+                   "f"(c[6]), "f"(c[7]), "r"(e[0]));
-          "r"(b[5]), "r"(b[7]), "f"(c[4]), "f"(c[5]), "f"(c[6]), "f"(c[7]),
-          "r"(e[0]));
  }
 }

--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -68,6 +68,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_fast", torch::kCUDA, &gelu_fast);
+  // Quick GELU implementation.
+  ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
@@ -133,13 +137,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("gptq_marlin_repack", &gptq_marlin_repack);
  ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
+  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
+  ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
+  ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
  // quantization.
  ops.def(
-      "cutlass_scaled_mm_dq(Tensor! out, Tensor a,"
+      "cutlass_scaled_mm(Tensor! out, Tensor a,"
-      "                     Tensor b, Tensor a_scales,"
+      "                  Tensor b, Tensor a_scales,"
-      "                     Tensor b_scales) -> ()");
+      "                  Tensor b_scales, Tensor? bias) -> ()");
-  ops.impl("cutlass_scaled_mm_dq", torch::kCUDA, &cutlass_scaled_mm_dq);
+  ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
+  // Check if cutlass scaled_mm is supported for CUDA devices of the given
+  // capability
+  ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
+  ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA,
+           &cutlass_scaled_mm_supports_fp8);
 #endif
  // Quantized GEMM for GPTQ.

--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
-sphinx == 6.2.1
+sphinx==6.2.1
-sphinx-book-theme == 1.0.1
+sphinx-book-theme==1.0.1
-sphinx-copybutton == 0.5.2
+sphinx-copybutton==0.5.2
-myst-parser == 2.0.0
+myst-parser==2.0.0
 sphinx-argparse
 # packages to install to build the documentation

--- a/docs/source/_templates/sections/header.html
+++ b/docs/source/_templates/sections/header.html
+<style>
+  .notification-bar {
+    width: 100vw;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    font-size: 16px;
+    padding: 0 6px 0 6px;
+  }
+  .notification-bar p {
+    margin: 0;
+  }
+  .notification-bar a {
+    font-weight: bold;
+    text-decoration: none;
+  }
+  /* Light mode styles (default) */
+  .notification-bar {
+    background-color: #fff3cd;
+    color: #856404;
+  }
+  .notification-bar a {
+    color: #d97706;
+  }
+  /* Dark mode styles */
+  html[data-theme=dark] .notification-bar {
+    background-color: #333;
+    color: #ddd;
+  }
+  html[data-theme=dark] .notification-bar a {
+    color: #ffa500; /* Brighter color for visibility */
+  }
+</style>
+<div class="notification-bar">
+  <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
+</div>
--- a/docs/source/community/meetups.rst
+++ b/docs/source/community/meetups.rst
@@ -5,6 +5,7 @@ vLLM Meetups
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
+- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
 - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
 - `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__
 - `The first vLLM meetup <https://lu.ma/first-vllm-meetup>`__, with a16z, October 5th 2023. `[Slides] <https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing>`__

--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -22,5 +22,6 @@ vLLM is a community project. Our compute resources for development and testing a
 - Trainy
 - UC Berkeley
 - UC San Diego
+- ZhenFund
 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -66,8 +66,20 @@ html_theme_options = {
    'path_to_docs': 'docs/source',
    'repository_url': 'https://github.com/vllm-project/vllm',
    'use_repository_button': True,
+    'use_edit_page_button': True,
 }
+# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
+READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
+if READTHEDOCS_VERSION_TYPE == "tag":
+    # remove the warning banner if the version is a tagged release
+    header_file = os.path.join(os.path.dirname(__file__),
+                               "_templates/sections/header.html")
+    # The file might be removed already if the build is triggered multiple times
+    # (readthedocs build both HTML and PDF versions separately)
+    if os.path.exists(header_file):
+        os.remove(header_file)
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
@@ -95,6 +107,7 @@ autodoc_mock_imports = [
    'triton',
    "tqdm",
    "tensorizer",
+    "pynvml",
 ]
 for mock_target in autodoc_mock_imports:

--- a/docs/source/dev/dockerfile/dockerfile.rst
+++ b/docs/source/dev/dockerfile/dockerfile.rst
 Dockerfile
 ====================
-See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct 
+See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`__ for the main Dockerfile to construct 
-the image for running an OpenAI compatible server with vLLM.
+the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here <https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html>`__.
-  Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
+Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
-   - All build stages
+- All build stages
-   - The default build target (highlighted in grey)
+- The default build target (highlighted in grey)
-   - External images (with dashed borders)
+- External images (with dashed borders)
-   The edges of the build graph represent:
+The edges of the build graph represent:
-   - FROM ... dependencies (with a solid line and a full arrow head)
+- FROM ... dependencies (with a solid line and a full arrow head)
-   - COPY --from=... dependencies (with a dashed line and an empty arrow head)
+- COPY --from=... dependencies (with a dashed line and an empty arrow head)
-   - RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
+- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
   .. figure:: ../../assets/dev/dockerfile-stages-dependency.png
      :alt: query

--- a/docs/source/dev/input_processing/input_processing_pipeline.rst
+++ b/docs/source/dev/input_processing/input_processing_pipeline.rst
+.. _input_processing_pipeline:
+Input Processing Pipeline
+=========================
+1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
+2. Tokenize the data if necessary.
+3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
+   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
+4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
+5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
+6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
+   - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model.
--- a/docs/source/dev/input_processing/model_inputs_index.rst
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
+.. _input_processing:
+Input Processing
+================
+.. currentmodule:: vllm.inputs
+Each model can override parts of vLLM's :ref:`input processing pipeline <input_processing_pipeline>` via
+:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+Currently, this mechanism is only utilized in :ref:`multi-modal <multi_modality>` models for preprocessing multi-modal input 
+data in addition to input prompt, but it can be extended to text-only language models when needed.
+Guides
++++++
+.. toctree::
+   :maxdepth: 1
+   input_processing_pipeline
+Module Contents
+++++++++++++++
+LLM Engine Inputs
+-----------------
+.. autoclass:: vllm.inputs.LLMInputs
+    :members:
+    :show-inheritance:
+Registry
+--------
+.. autodata:: vllm.inputs.INPUT_REGISTRY
+.. automodule:: vllm.inputs.registry
+    :members:
+    :show-inheritance:
--- a/docs/source/dev/multimodal/adding_multimodal_plugin.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_plugin.rst
+.. _adding_multimodal_plugin:
+Adding a Multimodal Plugin
+==========================
+This document teaches you how to add a new modality to vLLM.
+Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.
+The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.
+.. note::
+  This article is a work in progress.
+..
+  TODO: Add more instructions on how to add new plugins once embeddings is in.
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
+.. _multi_modality:
 Multi-Modality
 ==============
@@ -5,16 +7,21 @@ Multi-Modality
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
-:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
+Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
-which allows you to pass in multi-modal input alongside text and token prompts.
+via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.
+Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
+by following :ref:`this guide <adding_multimodal_plugin>`.
+Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.
-By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
+Guides
-you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
++++++
-as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
-.. contents::
+.. toctree::
-   :local:
+   :maxdepth: 1
-   :backlinks: none
+   adding_multimodal_plugin
 Module Contents
 +++++++++++++++
@@ -24,9 +31,7 @@ Module Contents
 Registry
 --------
-.. data:: vllm.multimodal.MULTIMODAL_REGISTRY
+.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
-    The global :class:`MultiModalRegistry` which is used by model runners.
 .. autoclass:: vllm.multimodal.MultiModalRegistry
    :members:
@@ -35,7 +40,15 @@ Registry
 Base Classes
 ------------
-.. autoclass:: vllm.multimodal.MultiModalData
+.. autodata:: vllm.multimodal.BatchedTensors
+.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
+    :members:
+    :show-inheritance:
+.. autodata:: vllm.multimodal.MultiModalDataDict
+.. autoclass:: vllm.multimodal.MultiModalInputs
    :members:
    :show-inheritance:

--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -88,7 +88,7 @@ Option 2: Build from source
 - `Pytorch <https://pytorch.org/>`_
 - `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
+For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
 Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started <https://pytorch.org/get-started/locally/>`_
@@ -126,12 +126,12 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl
    $ cd vllm
    $ pip install -U -r requirements-rocm.txt
-    $ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
+    $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
 .. tip::
    - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation.
    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
-    - To use CK flash-attention, please use this flag ``export VLLM_USE_FLASH_ATTN_TRITON=0`` to turn off triton flash attention. 
+    - To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. 
    - The ROCm version of pytorch, ideally, should match the ROCm driver version.
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -10,6 +10,7 @@ Table of contents:
 #. :ref:`Requirements <cpu_backend_requirements>`
 #. :ref:`Quick start using Dockerfile <cpu_backend_quick_start_dockerfile>`
 #. :ref:`Build from source <build_cpu_backend_from_source>`
+#. :ref:`Intel Extension for PyTorch <ipex_guidance>`
 #. :ref:`Performance tips <cpu_backend_performance_tips>`
 .. _cpu_backend_requirements:
@@ -18,8 +19,8 @@ Requirements
 ------------
 * OS: Linux
-* Compiler: gcc/g++>=12.3.0 (recommended)
+* Compiler: gcc/g++>=12.3.0 (optional, recommended)
-* Instruction set architecture (ISA) requirement: AVX512 is required.
+* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
 .. _cpu_backend_quick_start_dockerfile:
@@ -41,7 +42,7 @@ Quick start using Dockerfile
 Build from source
 -----------------
- First, install required compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
+- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
 .. code-block:: console
@@ -70,6 +71,15 @@ Build from source
    - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.    
+.. _ipex_guidance:
+Intel Extension for PyTorch
+---------------------------
+- `Intel Extension for PyTorch (IPEX) <https://github.com/intel/intel-extension-for-pytorch>`_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
+- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed.
 .. _cpu_backend_performance_tips:
 Performance tips
@@ -77,6 +87,15 @@ Performance tips
 - vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run:
+.. code-block:: console
+    $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library
+    $ find / -name *libtcmalloc* # find the dynamic link library path
+    $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
+    $ python examples/offline_inference.py # run vLLM
 - vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription. 
 - If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading.

--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -8,35 +8,62 @@ Debugging hang/crash issues
 When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time:
- Downloading a model: do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
+- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
- Loading the model from disk: if the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
+- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
- Tensor parallel inference: if the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
-If you already take care of the above issues, and the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
+If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
 - Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
 - Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble.
 - Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
- Set the environment variable ``export VLLM_TRACE_FUNCTION=1`` . All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. **Note: it will generate a lot of logs and slow down the system. Only use it for debugging purposes.**
+- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs.
+  .. warning::
+    vLLM function tracing will generate a lot of logs and slow down the system. Only use it for debugging purposes.
 With more logging, hopefully you can find the root cause of the issue.
+If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
 Here are some common issues that can cause hangs:
- The network setup is incorrect. The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``.
+- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address.
- Hardware/driver setup is incorrect. GPU communication cannot be established. You can run a sanity check script below to see if the GPU communication is working correctly.
+- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly.
 .. code-block:: python
-    # save it as `test.py`` , and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py`
-    # adjust `--nproc-per-node` to the number of GPUs you want to use.
    import torch
    import torch.distributed as dist
    dist.init_process_group(backend="nccl")
-    data = torch.FloatTensor([1,] * 128).to(f"cuda:{dist.get_rank()}")
+    local_rank = dist.get_rank() % torch.cuda.device_count()
+    data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}")
    dist.all_reduce(data, op=dist.ReduceOp.SUM)
    torch.cuda.synchronize()
    value = data.mean().item()
-    assert value == dist.get_world_size()
+    world_size = dist.get_world_size()
+    assert value == world_size, f"Expected {world_size}, got {value}"
+    gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
+    cpu_data = torch.FloatTensor([1,] * 128)
+    dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
+    value = cpu_data.mean().item()
+    assert value == world_size, f"Expected {world_size}, got {value}"
+    print("sanity check is successful!")
+.. tip::
+    Save the script as ``test.py``.
+    If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use.
+    If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``:
+    - is the correct IP address of the master node
+    - is reachable from all nodes
+    - is set before running the script.
+    If the script runs successfully, you should see the message ``sanity check is successful!``.
-If the problem persists, feel free to open an `issue <https://github.com/vllm-project/vllm/issues/new/choose>`_ on GitHub, with a detailed description of the issue, your environment, and the logs.
+If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -20,7 +20,7 @@ You can install vLLM using pip:
 .. code-block:: console
    $ # (Recommended) Create a new conda environment.
-    $ conda create -n myenv python=3.9 -y
+    $ conda create -n myenv python=3.10 -y
    $ conda activate myenv
    $ # Install vLLM with CUDA 12.1.
@@ -35,13 +35,27 @@ You can install vLLM using pip:
        $ # Install vLLM with CUDA 11.8.
        $ export VLLM_VERSION=0.4.0
-        $ export PYTHON_VERSION=39
+        $ export PYTHON_VERSION=310
        $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
    In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
    Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
+.. note::
+    vLLM also publishes a subset of wheels (Python 3.10, 3.11 with CUDA 12) for every commit since v0.5.3. You can download them with the following command:
+    .. code-block:: console
+        $ export VLLM_VERSION=0.5.2 # vLLM's main branch version is currently set to latest released tag
+        $ export PYTHON_VERSION=310
+        $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
+        $ # You can also access a specific commit
+        $ # export VLLM_COMMIT=...
+        $ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
 .. _build_from_source:
 Build from source

--- a/docs/source/getting_started/openvino-installation.rst
+++ b/docs/source/getting_started/openvino-installation.rst
+.. _installation_openvino:
+Installation with OpenVINO
+==========================
+vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support. OpenVINO vLLM backend supports the following advanced vLLM features:
+- Prefix caching (``--enable-prefix-caching``)
+- Chunked prefill (``--enable-chunked-prefill``)
+**Table of contents**:
+- :ref:`Requirements <openvino_backend_requirements>`
+- :ref:`Quick start using Dockerfile <openvino_backend_quick_start_dockerfile>`
+- :ref:`Build from source <install_openvino_backend_from_source>`
+- :ref:`Performance tips <openvino_backend_performance_tips>`
+- :ref:`Limitations <openvino_backend_limitations>`
+.. _openvino_backend_requirements:
+Requirements
+------------
+* OS: Linux
+* Instruction set architecture (ISA) requirement: at least AVX2.
+.. _openvino_backend_quick_start_dockerfile:
+Quick start using Dockerfile
+----------------------------
+.. code-block:: console
+    $ docker build -f Dockerfile.openvino -t vllm-openvino-env .
+    $ docker run -it --rm vllm-openvino-env
+.. _install_openvino_backend_from_source:
+Install from source
+-------------------
+- First, install Python. For example, on Ubuntu 22.04, you can run:
+  .. code-block:: console
+      $ sudo apt-get update  -y
+      $ sudo apt-get install python3
+- Second, install prerequisites vLLM OpenVINO backend installation:
+  .. code-block:: console
+      $ pip install --upgrade pip
+      $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+- Finally, install vLLM with OpenVINO backend: 
+  .. code-block:: console
+      $ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+.. _openvino_backend_performance_tips:
+Performance tips
+----------------
+vLLM OpenVINO backend uses the following environment variables to control behavior:
+- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
+- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
+- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off.
+To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``)
+OpenVINO best known configuration is:
+.. code-block:: console
+    $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
+        python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
+.. _openvino_backend_limitations:
+Limitations
+-----------
+- LoRA serving is not supported.
+- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
+- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
+- Speculative sampling is not tested within vLLM integration.
--- a/docs/source/getting_started/tpu-installation.rst
+++ b/docs/source/getting_started/tpu-installation.rst
+.. _installation_tpu:
+Installation with TPU
+=====================
+vLLM supports Google Cloud TPUs using PyTorch XLA.
+Requirements
+------------
+* Google Cloud TPU VM (single host)
+* TPU versions: v5e, v5p, v4
+* Python: 3.10
+Installation options:
+1. :ref:`Build a docker image with Dockerfile <build_docker_tpu>`.
+2. :ref:`Build from source <build_from_source_tpu>`.
+.. _build_docker_tpu:
+Build a docker image with :code:`Dockerfile.tpu`
+------------------------------------------------
+`Dockerfile.tpu <https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu>`_ is provided to build a docker image with TPU support.
+.. code-block:: console
+    $ docker build -f Dockerfile.tpu -t vllm-tpu .
+You can run the docker image with the following command:
+.. code-block:: console
+    $ # Make sure to add `--privileged --net host --shm-size=16G`.
+    $ docker run --privileged --net host --shm-size=16G -it vllm-tpu
+.. _build_from_source_tpu:
+Build from source
+-----------------
+You can also build and install the TPU backend from source.
+First, install the dependencies:
+.. code-block:: console
+    $ # (Recommended) Create a new conda environment.
+    $ conda create -n myenv python=3.10 -y
+    $ conda activate myenv
+    $ # Clean up the existing torch and torch-xla packages.
+    $ pip uninstall torch torch-xla -y
+    $ # Install PyTorch and PyTorch XLA.
+    $ export DATE="+20240601"
+    $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl
+    $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl
+    $ # Install JAX and Pallas.
+    $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+    $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+    $ # Install other build dependencies.
+    $ pip install packaging aiohttp
+Next, build vLLM from source. This will only take a few seconds:
+.. code-block:: console
+    $ VLLM_TARGET_DEVICE="tpu" python setup.py develop
+.. tip::
+    If you encounter the following error:
+    .. code-block:: console
+        from torch._C import *  # noqa: F403
+        ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory
+    You can install OpenBLAS with the following command:
+    .. code-block:: console
+        $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
--- a/docs/source/getting_started/xpu-installation.rst
+++ b/docs/source/getting_started/xpu-installation.rst
+.. _installation_xpu:
+Installation with XPU
+========================
+vLLM initially supports basic model inferencing and serving on Intel GPU platform.
+Table of contents:
+#. :ref:`Requirements <xpu_backend_requirements>`
+#. :ref:`Quick start using Dockerfile <xpu_backend_quick_start_dockerfile>`
+#. :ref:`Build from source <build_xpu_backend_from_source>`
+.. _xpu_backend_requirements:
+Requirements
+------------
+* OS: Linux
+* Supported Hardware: Intel Data Center GPU (Intel ARC GPU WIP)
+* OneAPI requirements: oneAPI 2024.1 
+.. _xpu_backend_quick_start_dockerfile:
+Quick start using Dockerfile
+----------------------------
+.. code-block:: console
+    $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+    $ docker run -it \
+                 --rm \
+                 --network=host \
+                 --device /dev/dri \
+                 -v /dev/dri/by-path:/dev/dri/by-path \
+                 vllm-xpu-env
+.. _build_xpu_backend_from_source:
+Build from source
+-----------------
+- First, install required driver and intel OneAPI 2024.1 or later.
+- Second, install Python packages for vLLM XPU backend building:
+.. code-block:: console
+    $ source /opt/intel/oneapi/setvars.sh
+    $ pip install --upgrade pip
+    $ pip install -v -r requirements-xpu.txt 
+- Finally, build and install vLLM XPU backend: 
+.. code-block:: console
+    $ VLLM_TARGET_DEVICE=xpu python setup.py install
+.. note::
+    - FP16 is the default data type in the current XPU backend. The BF16 data
+      type will be supported in the future.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -38,7 +38,7 @@ vLLM is flexible and easy to use with:
 * Seamless integration with popular HuggingFace models
 * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-* Tensor parallelism support for distributed inference
+* Tensor parallelism and pipeline parallelism support for distributed inference
 * Streaming outputs
 * OpenAI-compatible API server
 * Support NVIDIA GPUs and AMD GPUs
@@ -63,8 +63,11 @@ Documentation
   getting_started/installation
   getting_started/amd-installation
-   getting_started/neuron-installation
+   getting_started/openvino-installation
   getting_started/cpu-installation
+   getting_started/neuron-installation
+   getting_started/tpu-installation
+   getting_started/xpu-installation
   getting_started/quickstart
   getting_started/debugging
   getting_started/examples/examples_index
@@ -80,6 +83,8 @@ Documentation
   serving/env_vars
   serving/usage_stats
   serving/integrations
+   serving/tensorizer
+   serving/faq
 .. toctree::
   :maxdepth: 1
@@ -87,6 +92,7 @@ Documentation
   models/supported_models
   models/adding_model
+   models/enabling_multimodal_inputs
   models/engine_args
   models/lora
   models/vlm
@@ -97,6 +103,7 @@ Documentation
   :maxdepth: 1
   :caption: Quantization
+   quantization/supported_hardware
   quantization/auto_awq
   quantization/fp8
   quantization/fp8_e5m2_kvcache
@@ -110,12 +117,14 @@ Documentation
   automatic_prefix_caching/details
 .. toctree::
+   :maxdepth: 2
   :caption: Developer Documentation
   dev/sampling_params
   dev/offline_inference/offline_index
   dev/engine/engine_index
   dev/kernel/paged_attention
+   dev/input_processing/model_inputs_index
   dev/multimodal/multimodal_index
   dev/dockerfile/dockerfile