Merge tag 'v0.9.1' into v0.9.1-dev

4c676e3d · zhuwenwen · b4c4464d · b6553be1 · b4c4464d · b4c4464d
Commit 4c676e3d authored Jun 20, 2025 by zhuwenwen
20 changed files
--- a/docs/source/api/offline_inference/llm.md
+++ b/docs/source/api/offline_inference/llm.md
-# LLM Class
-```{eval-rst}
-.. autoclass:: vllm.LLM
-    :members:
-    :show-inheritance:
-```
--- a/docs/source/api/offline_inference/llm_inputs.md
+++ b/docs/source/api/offline_inference/llm_inputs.md
-# LLM Inputs
-```{eval-rst}
-.. autodata:: vllm.inputs.PromptType
-```
-```{eval-rst}
-.. autoclass:: vllm.inputs.TextPrompt
-    :show-inheritance:
-    :members:
-    :member-order: bysource
-```
-```{eval-rst}
-.. autoclass:: vllm.inputs.TokensPrompt
-    :show-inheritance:
-    :members:
-    :member-order: bysource
-```
--- a/docs/source/assets/contributing/dockerfile-stages-dependency.png
+++ b/docs/source/assets/contributing/dockerfile-stages-dependency.png
--- a/docs/source/community/blog.md
+++ b/docs/source/community/blog.md
-# vLLM Blog
-vLLM blog posts are published [here](https://blog.vllm.ai/).
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
-# SPDX-License-Identifier: Apache-2.0
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-# -- Path setup --------------------------------------------------------------
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-import datetime
-import inspect
-import logging
-import os
-import sys
-import requests
-from sphinx.ext import autodoc
-logger = logging.getLogger(__name__)
-sys.path.append(os.path.abspath("../.."))
-# -- Project information -----------------------------------------------------
-project = 'vLLM'
-copyright = f'{datetime.datetime.now().year}, vLLM Team'
-author = 'the vLLM Team'
-# -- General configuration ---------------------------------------------------
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    "sphinx.ext.napoleon",
-    "sphinx.ext.linkcode",
-    "sphinx.ext.intersphinx",
-    "sphinx_copybutton",
-    "sphinx.ext.autodoc",
-    "sphinx.ext.autosummary",
-    "myst_parser",
-    "sphinxarg.ext",
-    "sphinx_design",
-    "sphinx_togglebutton",
-]
-myst_enable_extensions = [
-    "colon_fence",
-]
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"]
-# Exclude the prompt "$" when copying code
-copybutton_prompt_text = r"\$ "
-copybutton_prompt_is_regexp = True
-# -- Options for HTML output -------------------------------------------------
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_title = project
-html_theme = 'sphinx_book_theme'
-html_logo = 'assets/logos/vllm-logo-text-light.png'
-html_favicon = 'assets/logos/vllm-logo-only-light.ico'
-html_theme_options = {
-    'path_to_docs': 'docs/source',
-    'repository_url': 'https://github.com/vllm-project/vllm',
-    'use_repository_button': True,
-    'use_edit_page_button': True,
-}
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-html_js_files = ["custom.js"]
-html_css_files = ["custom.css"]
-myst_heading_anchors = 2
-myst_url_schemes = {
-    'http': None,
-    'https': None,
-    'mailto': None,
-    'ftp': None,
-    "gh-issue": {
-        "url":
-        "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}",
-        "title": "Issue #{{path}}",
-        "classes": ["github"],
-    },
-    "gh-pr": {
-        "url":
-        "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}",
-        "title": "Pull Request #{{path}}",
-        "classes": ["github"],
-    },
-    "gh-project": {
-        "url": "https://github.com/orgs/vllm-project/projects/{{path}}",
-        "title": "Project #{{path}}",
-        "classes": ["github"],
-    },
-    "gh-dir": {
-        "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}",
-        "title": "{{path}}",
-        "classes": ["github"],
-    },
-    "gh-file": {
-        "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}",
-        "title": "{{path}}",
-        "classes": ["github"],
-    },
-}
-# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
-READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
-if READTHEDOCS_VERSION_TYPE == "tag":
-    # remove the warning banner if the version is a tagged release
-    header_file = os.path.join(os.path.dirname(__file__),
-                               "_templates/sections/header.html")
-    # The file might be removed already if the build is triggered multiple times
-    # (readthedocs build both HTML and PDF versions separately)
-    if os.path.exists(header_file):
-        os.remove(header_file)
-# Generate additional rst documentation here.
-def setup(app):
-    from docs.source.generate_examples import generate_examples
-    generate_examples()
-_cached_base: str = ""
-_cached_branch: str = ""
-def get_repo_base_and_branch(pr_number):
-    global _cached_base, _cached_branch
-    if _cached_base and _cached_branch:
-        return _cached_base, _cached_branch
-    url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
-    response = requests.get(url)
-    if response.status_code == 200:
-        data = response.json()
-        _cached_base = data['head']['repo']['full_name']
-        _cached_branch = data['head']['ref']
-        return _cached_base, _cached_branch
-    else:
-        logger.error("Failed to fetch PR details: %s", response)
-        return None, None
-def linkcode_resolve(domain, info):
-    if domain != 'py':
-        return None
-    if not info['module']:
-        return None
-    filename = info['module'].replace('.', '/')
-    module = info['module']
-    # try to determine the correct file and line number to link to
-    obj = sys.modules[module]
-    # get as specific as we can
-    lineno: int = 0
-    filename: str = ""
-    try:
-        for part in info['fullname'].split('.'):
-            obj = getattr(obj, part)
-            # Skip decorator wrappers by checking if the object is a function
-            # and has a __wrapped__ attribute (which decorators typically set)
-            while hasattr(obj, '__wrapped__'):
-                obj = obj.__wrapped__
-            if not (inspect.isclass(obj) or inspect.isfunction(obj)
-                    or inspect.ismethod(obj)):
-                obj = obj.__class__  # Get the class of the instance
-            lineno = inspect.getsourcelines(obj)[1]
-            filename = (inspect.getsourcefile(obj)
-                        or f"{filename}.py").split("vllm/", 1)[1]
-    except Exception:
-        # For some things, like a class member, this won't work, so
-        # we'll use the line number of the parent (the class)
-        pass
-    if filename.startswith("checkouts/"):
-        # a PR build on readthedocs
-        pr_number = filename.split("/")[1]
-        filename = filename.split("/", 2)[2]
-        base, branch = get_repo_base_and_branch(pr_number)
-        if base and branch:
-            return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
-    # Otherwise, link to the source file on the main branch
-    return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"
-# Mock out external dependencies here, otherwise the autodoc pages may be blank.
-autodoc_mock_imports = [
-    "blake3",
-    "compressed_tensors",
-    "cpuinfo",
-    "cv2",
-    "torch",
-    "transformers",
-    "psutil",
-    "prometheus_client",
-    "sentencepiece",
-    "vllm._C",
-    "PIL",
-    "numpy",
-    'triton',
-    "tqdm",
-    "tensorizer",
-    "pynvml",
-    "outlines",
-    "xgrammar",
-    "librosa",
-    "soundfile",
-    "gguf",
-    "lark",
-    "decord",
-]
-for mock_target in autodoc_mock_imports:
-    if mock_target in sys.modules:
-        logger.info(
-            "Potentially problematic mock target (%s) found; "
-            "autodoc_mock_imports cannot mock modules that have already "
-            "been loaded into sys.modules when the sphinx build starts.",
-            mock_target)
-class MockedClassDocumenter(autodoc.ClassDocumenter):
-    """Remove note about base class when a class is derived from object."""
-    def add_line(self, line: str, source: str, *lineno: int) -> None:
-        if line == "   Bases: :py:class:`object`":
-            return
-        super().add_line(line, source, *lineno)
-autodoc.ClassDocumenter = MockedClassDocumenter
-intersphinx_mapping = {
-    "python": ("https://docs.python.org/3", None),
-    "typing_extensions":
-    ("https://typing-extensions.readthedocs.io/en/latest", None),
-    "aiohttp": ("https://docs.aiohttp.org/en/stable", None),
-    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
-    "numpy": ("https://numpy.org/doc/stable", None),
-    "torch": ("https://pytorch.org/docs/stable", None),
-    "psutil": ("https://psutil.readthedocs.io/en/stable", None),
-}
-autodoc_preserve_defaults = True
-autodoc_warningiserror = True
-navigation_with_keys = False
--- a/docs/source/contributing/model/index.md
+++ b/docs/source/contributing/model/index.md
-(new-model)=
-# Adding a New Model
-This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
-:::{toctree}
-:caption: Contents
-:maxdepth: 1
-basic
-registration
-tests
-multimodal
-:::
-:::{note}
-The complexity of adding a new model depends heavily on the model's architecture.
-The process is fairly straightforward if the model shares a similar architecture with an existing model in vLLM.
-However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
-:::
-:::{tip}
-If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
-or ask on our [developer slack](https://slack.vllm.ai).
-We will be happy to help you out!
-:::
--- a/docs/source/contributing/model/multimodal.md
+++ b/docs/source/contributing/model/multimodal.md
-(supports-multimodal)=
-# Multi-Modal Support
-This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
-## 1. Update the base vLLM model
-It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
-Further update the model as follows:
- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example:
-  ```diff
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-  +     pixel_values: torch.Tensor,
-    ) -> SamplerOutput:
-  ```
-  More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it.
- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
-    ```python
-    class YourModelForImage2Seq(nn.Module):
-        ...
-        def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
-            assert self.vision_encoder is not None
-            image_features = self.vision_encoder(image_input)
-            return self.multi_modal_projector(image_features)
-        def get_multimodal_embeddings(
-                self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-            # Validate the multimodal input keyword arguments
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            if image_input is None:
-                return None
-            # Run multimodal inputs through encoder and projector
-            vision_embeddings = self._process_image_input(image_input)
-            return vision_embeddings
-    ```
-    :::{important}
-    The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
-    :::
- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
-    ```python
-    from .utils import merge_multimodal_embeddings
-    class YourModelForImage2Seq(nn.Module):
-        ...
-        def get_input_embeddings(
-            self,
-            input_ids: torch.Tensor,
-            multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-        ) -> torch.Tensor:
-            # `get_input_embeddings` should already be implemented for the language 
-            # model as one of the requirements of basic vLLM model implementation.
-            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-            if multimodal_embeddings is not None:
-                inputs_embeds = merge_multimodal_embeddings(
-                    input_ids=input_ids, 
-                    inputs_embeds=inputs_embeds, 
-                    multimodal_embeddings=multimodal_embeddings,
-                    placeholder_token_id=self.config.image_token_index)
-            return inputs_embeds
-    ```
- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model.
-    ```python
-    class YourModelForImage2Seq(nn.Module):
-        ...
-        def get_language_model(self) -> torch.nn.Module:
-            # Change `language_model` according to your implementation.
-            return self.language_model
-    ```
- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-  ```diff
-  + from vllm.model_executor.models.interfaces import SupportsMultiModal
-  - class YourModelForImage2Seq(nn.Module):
-  + class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-  ```
-  :::{note}
-  The model class does not have to be named {code}`*ForCausalLM`.
-  Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
-  :::
-## 2. Specify processing information
-Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo`
-to provide basic information related to HF processing.
-### Maximum number of input items
-You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits`
-to return the maximum number of input items for each modality supported by the model.
-For example, if the model supports any number of images but only one video per prompt:
-```python
-def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-    return {"image": None, "video": 1}
-```
-## 3. Specify dummy inputs
-Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
-HF processing as well as memory profiling.
-### For memory profiling
-Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
-Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same as the number of placeholder feature tokens.
-::::{tab-set}
-:::{tab-item} Basic example: LLaVA
-:sync: llava
-Looking at the code of HF's `LlavaForConditionalGeneration`:
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
-n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
-n_image_features = image_features.shape[0] * image_features.shape[1]
-if n_image_tokens != n_image_features:
-    raise ValueError(
-        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-    )
-special_image_mask = (
-    (input_ids == self.config.image_token_index)
-    .unsqueeze(-1)
-    .expand_as(inputs_embeds)
-    .to(inputs_embeds.device)
-)
-image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-```
-The number of placeholder feature tokens per image is `image_features.shape[1]`.
-`image_features` is calculated inside the `get_image_features` method:
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
-image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
-selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
-if vision_feature_select_strategy == "default":
-    selected_image_feature = selected_image_feature[:, 1:]
-elif vision_feature_select_strategy == "full":
-    selected_image_feature = selected_image_feature
-else:
-    raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
-image_features = self.multi_modal_projector(selected_image_feature)
-return image_features
-```
-We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
-(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
-Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`.
-The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention
-mechanism doesn't change the sequence length of the output hidden states.
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
-hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
-hidden_states = self.pre_layrnorm(hidden_states)
-encoder_outputs = self.encoder(
-    inputs_embeds=hidden_states,
-    output_attentions=output_attentions,
-    output_hidden_states=output_hidden_states,
-    return_dict=return_dict,
-)
-```
-To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
-target_dtype = self.patch_embedding.weight.dtype
-patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
-patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
-class_embeds = self.class_embedding.expand(batch_size, 1, -1)
-embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
-if interpolate_pos_encoding:
-    embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
-else:
-    embeddings = embeddings + self.position_embedding(self.position_ids)
-return embeddings
-```
-We can infer that `embeddings.shape[1] == self.num_positions`, where
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
-self.num_patches = (self.image_size // self.patch_size) ** 2
-self.num_positions = self.num_patches + 1
-```
-Overall, the number of placeholder feature tokens for an image can be calculated as:
-```python
-def get_num_image_tokens(
-    self,
-    *,
-    image_width: int,
-    image_height: int,
-) -> int:
-    hf_config = self.get_hf_config()
-    hf_processor = self.get_hf_processor()
-    image_size = hf_config.vision_config.image_size
-    patch_size = hf_config.vision_config.patch_size
-    num_image_tokens = (image_size // patch_size) ** 2 + 1
-    if hf_processor.vision_feature_select_strategy == "default":
-        num_image_tokens -= 1
-    return num_image_tokens
-```
-Notice that the number of image tokens doesn't depend on the image width and height.
-We can simply use a dummy `image_size` to calculate the multimodal profiling data:
-```python
-# NOTE: In actuality, this is usually implemented as part of the
-# model's subclass of `BaseProcessingInfo`, but we show it as is
-# here for simplicity.
-def get_image_size_with_most_features(self) -> ImageSize:
-    hf_config = self.get_hf_config()
-    width = height = hf_config.image_size
-    return ImageSize(width=width, height=height)
-def get_dummy_mm_data(
-    self,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-) -> MultiModalDataDict:
-    num_images = mm_counts.get("image", 0)
-    target_width, target_height = \
-        self.info.get_image_size_with_most_features()
-    return {
-        "image":
-        self._get_dummy_images(width=target_width,
-                               height=target_height,
-                               num_images=num_images)
-    }
-```
-For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
-```python
-def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
-    num_images = mm_counts.get("image", 0)
-    processor = self.info.get_hf_processor()
-    image_token = processor.image_token
-    return image_token * num_images
-```
-:::
-:::{tab-item} No input placeholders: Fuyu
-:sync: fuyu
-Looking at the code of HF's `FuyuForCausalLM`:
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
-if image_patches is not None and past_key_values is None:
-    patch_embeddings = [
-        self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
-        .squeeze(0)
-        .to(inputs_embeds.device)
-        for patch in image_patches
-    ]
-    inputs_embeds = self.gather_continuous_embeddings(
-        word_embeddings=inputs_embeds,
-        continuous_embeddings=patch_embeddings,
-        image_patch_input_indices=image_patches_indices,
-    )
-```
-The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
-which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
-Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information?
-Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**.
-The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then
-`FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`.
-In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
-returning the dimensions after resizing (but before padding) as metadata.
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
-image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
-batch_images = image_encoding["images"]
-image_unpadded_heights = image_encoding["image_unpadded_heights"]
-image_unpadded_widths = image_encoding["image_unpadded_widths"]
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
-if do_resize:
-    batch_images = [
-        [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
-        for images in batch_images
-    ]
-image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
-image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
-image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
-if do_pad:
-    batch_images = [
-        [
-            self.pad_image(
-                image,
-                size=size,
-                mode=padding_mode,
-                constant_values=padding_value,
-                input_data_format=input_data_format,
-            )
-            for image in images
-        ]
-        for images in batch_images
-    ]
-```
-In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
-model_image_input = self.image_processor.preprocess_with_tokenizer_info(
-    image_input=tensor_batch_images,
-    image_present=image_present,
-    image_unpadded_h=image_unpadded_heights,
-    image_unpadded_w=image_unpadded_widths,
-    image_placeholder_id=image_placeholder_id,
-    image_newline_id=image_newline_id,
-    variable_sized=True,
-)
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
-image_height, image_width = image.shape[1], image.shape[2]
-if variable_sized:  # variable_sized=True
-    new_h = min(
-        image_height,
-        math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
-    )
-    new_w = min(
-        image_width,
-        math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
-    )
-    image = image[:, :new_h, :new_w]
-    image_height, image_width = new_h, new_w
-num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
-tensor_of_image_ids = torch.full(
-    [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
-)
-patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
-assert num_patches == patches.shape[0]
-```
-The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
-patch_size = patch_size if patch_size is not None else self.patch_size
-patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
-if image_height % patch_height != 0:
-    raise ValueError(f"{image_height=} must be divisible by {patch_height}")
-if image_width % patch_width != 0:
-    raise ValueError(f"{image_width=} must be divisible by {patch_width}")
-num_patches_per_dim_h = image_height // patch_height
-num_patches_per_dim_w = image_width // patch_width
-num_patches = num_patches_per_dim_h * num_patches_per_dim_w
-```
-These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
-to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
-```python
-def get_image_size_with_most_features(self) -> ImageSize:
-    image_processor = self.get_image_processor()
-    return ImageSize(width=image_processor.size["width"],
-                        height=image_processor.size["height"])
-```
-Fuyu does not expect image placeholders in the inputs to HF processor, so
-the dummy prompt text is empty regardless of the number of images.
-```python
-def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
-    return ""
-```
-For the multimodal image profiling data, the logic is very similar to LLaVA:
-```python
-def get_dummy_mm_data(
-    self,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-) -> MultiModalDataDict:
-    target_width, target_height = \
-        self.info.get_image_size_with_most_features()
-    num_images = mm_counts.get("image", 0)
-    return {
-        "image":
-        self._get_dummy_images(width=target_width,
-                               height=target_height,
-                               num_images=num_images)
-    }
-```
-:::
-::::
-## 4. Specify processing details
-Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
-to fill in the missing details about HF processing.
-:::{seealso}
-[Multi-Modal Data Processing](#mm-processing)
-:::
-### Multi-modal fields
-Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to
-return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
-:::::{tab-set}
-::::{tab-item} Basic example: LLaVA
-:sync: llava
-The output of `CLIPImageProcessor` is a simple tensor with shape
-`(num_images, num_channels, image_height, image_width)`:
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
-images = [
-    to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
-    for image in all_images
-]
-data = {"pixel_values": images}
-return BatchFeature(data=data, tensor_type=return_tensors)
-```
-So, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows:
-```python
-def _get_mm_fields_config(
-    self,
-    hf_inputs: BatchFeature,
-    hf_processor_mm_kwargs: Mapping[str, object],
-) -> Mapping[str, MultiModalFieldConfig]:
-    return dict(
-        pixel_values=MultiModalFieldConfig.batched("image"),
-    )
-```
-:::{note}
-Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
-pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
-:::
-::::
-::::{tab-item} With postprocessing: Fuyu
-:sync: fuyu
-The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates
-the patches from each image belonging to an item in the batch:
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679
-        image_input_ids.append(tensor_of_image_ids)
-        image_patches.append(patches)
-    else:
-        image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
-batch_image_input_ids.append(image_input_ids)
-batch_image_patches.append(image_patches)
-```
-The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore
-`(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
-In order to support the use of {func}`MultiModalFieldConfig.batched` like in LLaVA,
-we remove the extra batch dimension by overriding {meth}`BaseMultiModalProcessor._call_hf_processor`:
-```python
-def _call_hf_processor(
-    self,
-    prompt: str,
-    mm_data: Mapping[str, object],
-    mm_kwargs: Mapping[str, object],
-) -> BatchFeature:
-    processed_outputs = super()._call_hf_processor(
-        prompt=prompt,
-        mm_data=mm_data,
-        mm_kwargs=mm_kwargs,
-    )
-    image_patches = processed_outputs.get("image_patches")
-    if image_patches is not None:
-        images = mm_data["images"]
-        assert isinstance(images, list)
-        # Original output: (1, num_images, Pn, Px * Py * C)
-        # New output: (num_images, Pn, Px * Py * C)
-        assert (isinstance(image_patches, list)
-                and len(image_patches) == 1)
-        assert (isinstance(image_patches[0], torch.Tensor)
-                and len(image_patches[0]) == len(images))
-        processed_outputs["image_patches"] = image_patches[0]
-    return processed_outputs
-```
-:::{note}
-Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
-for text-only inputs to prevent unnecessary warnings from HF processor.
-:::
-This lets us override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows:
-```python
-def _get_mm_fields_config(
-    self,
-    hf_inputs: BatchFeature,
-    hf_processor_mm_kwargs: Mapping[str, object],
-) -> Mapping[str, MultiModalFieldConfig]:
-    return dict(image_patches=MultiModalFieldConfig.batched("image"))
-```
-::::
-:::::
-### Prompt updates
-Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` to
-return a list of {class}`~vllm.multimodal.processing.PromptUpdate` instances.
-Each {class}`~vllm.multimodal.processing.PromptUpdate` instance specifies an update operation
-(e.g.: insertion, replacement) performed by the HF processor.
-::::{tab-set}
-:::{tab-item} Basic example: LLaVA
-:sync: llava
-Looking at HF's `LlavaProcessor`:
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170
-prompt_strings = []
-for sample in text:
-    sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
-    prompt_strings.append(sample)
-```
-It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
-Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` as follows:
-```python
-def _get_prompt_updates(
-    self,
-    mm_items: MultiModalDataItems,
-    hf_processor_mm_kwargs: Mapping[str, object],
-    out_mm_kwargs: MultiModalKwargs,
-) -> Sequence[PromptUpdate]:
-    hf_config = self.info.get_hf_config()
-    image_token_id = hf_config.image_token_index
-    def get_replacement(item_idx: int):
-        images = mm_items.get_items("image", ImageProcessorItems)
-        image_size = images.get_image_size(item_idx)
-        num_image_tokens = self.info.get_num_image_tokens(
-            image_width=image_size.width,
-            image_height=image_size.height,
-        )
-        return [image_token_id] * num_image_tokens
-    return [
-        PromptReplacement(
-            modality="image",
-            target=[image_token_id],
-            replacement=get_replacement,
-        ),
-    ]
-```
-:::
-:::{tab-item} Handling additional tokens: Fuyu
-:sync: fuyu
-Recall the layout of feature tokens from Step 2:
-```
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-...
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-```
-We define a helper function to return `ncols` and `nrows` directly:
-```python
-def get_image_feature_grid_size(
-    self,
-    *,
-    image_width: int,
-    image_height: int,
-) -> tuple[int, int]:
-    image_processor = self.get_image_processor()
-    target_width = image_processor.size["width"]
-    target_height = image_processor.size["height"]
-    patch_width = image_processor.patch_size["width"]
-    patch_height = image_processor.patch_size["height"]
-    if not (image_width <= target_width and image_height <= target_height):
-        height_scale_factor = target_height / image_height
-        width_scale_factor = target_width / image_width
-        optimal_scale_factor = min(height_scale_factor, width_scale_factor)
-        image_height = int(image_height * optimal_scale_factor)
-        image_width = int(image_width * optimal_scale_factor)
-    ncols = math.ceil(image_width / patch_width)
-    nrows = math.ceil(image_height / patch_height)
-    return ncols, nrows
-```
-Based on this, we can initially define our replacement tokens as:
-```python
-def get_replacement(item_idx: int):
-    images = mm_items.get_items("image", ImageProcessorItems)
-    image_size = images.get_image_size(item_idx)
-    ncols, nrows = self.info.get_image_feature_grid_size(
-        image_width=image_size.width,
-        image_height=image_size.height,
-    )
-    # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
-    # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
-    return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
-```
-However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
-a BOS token (`<s>`) is also added to the prompt:
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
-model_image_input = self.image_processor.preprocess_with_tokenizer_info(
-    image_input=tensor_batch_images,
-    image_present=image_present,
-    image_unpadded_h=image_unpadded_heights,
-    image_unpadded_w=image_unpadded_widths,
-    image_placeholder_id=image_placeholder_id,
-    image_newline_id=image_newline_id,
-    variable_sized=True,
-)
-prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
-    tokenizer=self.tokenizer,
-    prompts=prompts,
-    scale_factors=scale_factors,
-    max_tokens_to_generate=self.max_tokens_to_generate,
-    max_position_embeddings=self.max_position_embeddings,
-    add_BOS=True,
-    add_beginning_of_answer_token=True,
-)
-```
-To assign the vision embeddings to only the image tokens, instead of a string
-you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:
-```python
-hf_config = self.info.get_hf_config()
-bos_token_id = hf_config.bos_token_id  # `<s>`
-assert isinstance(bos_token_id, int)
-def get_replacement_fuyu(item_idx: int):
-    images = mm_items.get_items("image", ImageProcessorItems)
-    image_size = images.get_image_size(item_idx)
-    ncols, nrows = self.info.get_image_feature_grid_size(
-        image_width=image_size.width,
-        image_height=image_size.height,
-    )
-    image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                    [_NEWLINE_TOKEN_ID]) * nrows
-    return PromptUpdateDetails.select_token_id(
-        image_tokens + [bos_token_id],
-        embed_token_id=_IMAGE_TOKEN_ID,
-    )
-```
-Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
-we can search for it to conduct the replacement at the start of the string:
-```python
-def _get_prompt_updates(
-    self,
-    mm_items: MultiModalDataItems,
-    hf_processor_mm_kwargs: Mapping[str, object],
-    out_mm_kwargs: MultiModalKwargs,
-) -> Sequence[PromptUpdate]:
-    hf_config = self.info.get_hf_config()
-    bos_token_id = hf_config.bos_token_id
-    assert isinstance(bos_token_id, int)
-    tokenizer = self.info.get_tokenizer()
-    eot_token_id = tokenizer.bos_token_id
-    assert isinstance(eot_token_id, int)
-    def get_replacement_fuyu(item_idx: int):
-        images = mm_items.get_items("image", ImageProcessorItems)
-        image_size = images.get_image_size(item_idx)
-        ncols, nrows = self.info.get_image_feature_grid_size(
-            image_width=image_size.width,
-            image_height=image_size.height,
-        )
-        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                        [_NEWLINE_TOKEN_ID]) * nrows
-        return PromptUpdateDetails.select_token_id(
-            image_tokens + [bos_token_id],
-            embed_token_id=_IMAGE_TOKEN_ID,
-        )
-    return [
-        PromptReplacement(
-            modality="image",
-            target=[eot_token_id],
-            replacement=get_replacement_fuyu,
-        )
-    ]
-```
-:::
-::::
-## 5. Register processor-related classes
-After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2),
-{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3),
-and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4),
-decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>`
-to register them to the multi-modal registry:
-```diff
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-+ from vllm.multimodal import MULTIMODAL_REGISTRY
-+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
-+                                         info=YourProcessingInfo,
-+                                         dummy_inputs=YourDummyInputsBuilder)
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-```
-## Notes
-### Inserting feature tokens without replacement
-Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use {class}`~vllm.multimodal.processing.PromptInsertion` instead of {class}`~vllm.multimodal.processing.PromptReplacement` inside {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`.
-Examples:
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
-### Handling prompt updates unrelated to multi-modal data
-{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only` so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design](#mm-processing).
-Examples:
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
-### Custom HF processor
-Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor`.
-Examples:
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
--- a/docs/source/deployment/frameworks/helm.md
+++ b/docs/source/deployment/frameworks/helm.md
-(deployment-helm)=
-# Helm
-A Helm chart to deploy vLLM for Kubernetes
-Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
-This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.
-## Prerequisites
-Before you begin, ensure that you have the following:
- A running Kubernetes cluster
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
- Available GPU resources in your cluster
- S3 with the model which will be deployed
-## Installing the chart
-To install the chart with the release name `test-vllm`:
-```console
-helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
-```
-## Uninstalling the Chart
-To uninstall the `test-vllm` deployment:
-```console
-helm uninstall test-vllm --namespace=ns-vllm
-```
-The command removes all the Kubernetes components associated with the
-chart **including persistent volumes** and deletes the release.
-## Architecture
-:::{image} /assets/deployment/architecture_helm_deployment.png
-:::
-## Values
-:::{list-table}
-:widths: 25 25 25 25
-:header-rows: 1
- * Key
-  * Type
-  * Default
-  * Description
- * autoscaling
-  * object
-  * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
-  * Autoscaling configuration
- * autoscaling.enabled
-  * bool
-  * false
-  * Enable autoscaling
- * autoscaling.maxReplicas
-  * int
-  * 100
-  * Maximum replicas
- * autoscaling.minReplicas
-  * int
-  * 1
-  * Minimum replicas
- * autoscaling.targetCPUUtilizationPercentage
-  * int
-  * 80
-  * Target CPU utilization for autoscaling
- * configs
-  * object
-  * {}
-  * Configmap
- * containerPort
-  * int
-  * 8000
-  * Container port
- * customObjects
-  * list
-  * []
-  * Custom Objects configuration
- * deploymentStrategy
-  * object
-  * {}
-  * Deployment strategy configuration
- * externalConfigs
-  * list
-  * []
-  * External configuration
- * extraContainers
-  * list
-  * []
-  * Additional containers configuration
- * extraInit
-  * object
-  * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
-  * Additional configuration for the init container
- * extraInit.pvcStorage
-  * string
-  * "50Gi"
-  * Storage size of the s3
- * extraInit.s3modelpath
-  * string
-  * "relative_s3_model_path/opt-125m"
-  * Path of the model on the s3 which hosts model weights and config files
- * extraInit.awsEc2MetadataDisabled
-  * boolean
-  * true
-  * Disables the use of the Amazon EC2 instance metadata service
- * extraPorts
-  * list
-  * []
-  * Additional ports configuration
- * gpuModels
-  * list
-  * ["TYPE_GPU_USED"]
-  * Type of gpu used
- * image
-  * object
-  * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
-  * Image configuration
- * image.command
-  * list
-  * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
-  * Container launch command
- * image.repository
-  * string
-  * "vllm/vllm-openai"
-  * Image repository
- * image.tag
-  * string
-  * "latest"
-  * Image tag
- * livenessProbe
-  * object
-  * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
-  * Liveness probe configuration
- * livenessProbe.failureThreshold
-  * int
-  * 3
-  * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
- * livenessProbe.httpGet
-  * object
-  * {"path":"/health","port":8000}
-  * Configuration of the Kubelet http request on the server
- * livenessProbe.httpGet.path
-  * string
-  * "/health"
-  * Path to access on the HTTP server
- * livenessProbe.httpGet.port
-  * int
-  * 8000
-  * Name or number of the port to access on the container, on which the server is listening
- * livenessProbe.initialDelaySeconds
-  * int
-  * 15
-  * Number of seconds after the container has started before liveness probe is initiated
- * livenessProbe.periodSeconds
-  * int
-  * 10
-  * How often (in seconds) to perform the liveness probe
- * maxUnavailablePodDisruptionBudget
-  * string
-  * ""
-  * Disruption Budget Configuration
- * readinessProbe
-  * object
-  * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
-  * Readiness probe configuration
- * readinessProbe.failureThreshold
-  * int
-  * 3
-  * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
- * readinessProbe.httpGet
-  * object
-  * {"path":"/health","port":8000}
-  * Configuration of the Kubelet http request on the server
- * readinessProbe.httpGet.path
-  * string
-  * "/health"
-  * Path to access on the HTTP server
- * readinessProbe.httpGet.port
-  * int
-  * 8000
-  * Name or number of the port to access on the container, on which the server is listening
- * readinessProbe.initialDelaySeconds
-  * int
-  * 5
-  * Number of seconds after the container has started before readiness probe is initiated
- * readinessProbe.periodSeconds
-  * int
-  * 5
-  * How often (in seconds) to perform the readiness probe
- * replicaCount
-  * int
-  * 1
-  * Number of replicas
- * resources
-  * object
-  * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
-  * Resource configuration
- * resources.limits."nvidia.com/gpu"
-  * int
-  * 1
-  * Number of gpus used
- * resources.limits.cpu
-  * int
-  * 4
-  * Number of CPUs
- * resources.limits.memory
-  * string
-  * "16Gi"
-  * CPU memory configuration
- * resources.requests."nvidia.com/gpu"
-  * int
-  * 1
-  * Number of gpus used
- * resources.requests.cpu
-  * int
-  * 4
-  * Number of CPUs
- * resources.requests.memory
-  * string
-  * "16Gi"
-  * CPU memory configuration
- * secrets
-  * object
-  * {}
-  * Secrets configuration
- * serviceName
-  * string
-  *
-  * Service name
- * servicePort
-  * int
-  * 80
-  * Service port
- * labels.environment
-  * string
-  * test
-  * Environment name
- * labels.release
-  * string
-  * test
-  * Release name
-:::
--- a/docs/source/deployment/frameworks/index.md
+++ b/docs/source/deployment/frameworks/index.md
-# Using other frameworks
-:::{toctree}
-:maxdepth: 1
-anything-llm
-bentoml
-cerebrium
-dstack
-helm
-lws
-modal
-open-webui
-skypilot
-triton
-:::
--- a/docs/source/deployment/integrations/index.md
+++ b/docs/source/deployment/integrations/index.md
-# External Integrations
-:::{toctree}
-:maxdepth: 1
-kserve
-kubeai
-llamastack
-llmaz
-production-stack
-:::
--- a/docs/source/design/kernel/paged_attention.md
+++ b/docs/source/design/kernel/paged_attention.md
-(design-paged-attention)=
-# vLLM Paged Attention
- Currently, vLLM utilizes its own implementation of a multi-head query
-  attention kernel (`csrc/attention/attention_kernels.cu`).
-  This kernel is designed to be compatible with
-  vLLM's paged KV caches, where the key and value cache are stored in
-  separate blocks (note that this block concept differs from the GPU
-  thread block. So in a later document, I will refer to the vLLM paged
-  attention block as "block", while referring to the GPU thread block as
-  "thread block").
- To achieve high performance, this kernel relies on a specially
-  designed memory layout and access method, specifically when threads
-  read data from global memory to shared memory. The purpose of this
-  document is to provide a high-level explanation of the kernel
-  implementation step by step, aiding those who wish to learn about the
-  vLLM multi-head query attention kernel. After going through this
-  document, users will likely have a better understanding and find it
-  easier to follow the actual implementation.
- Please note that this document may not cover all details, such as how
-  to calculate the correct index for the corresponding data or the dot
-  multiplication implementation. However, after reading this document
-  and becoming familiar with the high-level logic flow, it should be
-  easier for you to read the actual code and understand the details.
-## Inputs
- The kernel function takes a list of arguments for the current thread
-  to perform its assigned work. The three most important arguments are
-  the input pointers `q`, `k_cache`, and `v_cache`, which point
-  to query, key, and value data on global memory that need to be read
-  and processed. The output pointer `out` points to global memory
-  where the result should be written. These four pointers actually
-  refer to multi-dimensional arrays, but each thread only accesses the
-  portion of data assigned to it. I have omitted all other runtime
-  parameters here for simplicity.
-  ```cpp
-  template<
-  typename scalar_t,
-  int HEAD_SIZE,
-  int BLOCK_SIZE,
-  int NUM_THREADS,
-  int PARTITION_SIZE = 0>
-  __device__ void paged_attention_kernel(
-  ... // Other side args.
-  const scalar_t* __restrict__ out,       // [num_seqs, num_heads, max_num_partitions, head_size]
-  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
-  const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-  const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
-  ... // Other side args.
-  )
-  ```
- There is also a list of template arguments above the function
-  signature that are determined during compilation time. `scalar_t`
-  represents the data type of the query, key, and value data elements,
-  such as FP16. `HEAD_SIZE` indicates the number of elements in each
-  head. `BLOCK_SIZE` refers to the number of tokens in each block.
-  `NUM_THREADS` denotes the number of threads in each thread block.
-  `PARTITION_SIZE` represents the number of tensor parallel GPUs (For
-  simplicity, we assume this is 0 and tensor parallel is disabled).
- With these arguments, we need to perform a sequence of preparations.
-  This includes calculating the current head index, block index, and
-  other necessary variables. However, for now, we can ignore these
-  preparations and proceed directly to the actual calculations. It will
-  be easier to understand them once we grasp the entire flow.
-## Concepts
- Just before we dive into the calculation flow, I want to describe a
-  few concepts that are needed for later sections. However, you may
-  skip this section and return later if you encounter any confusing
-  terminologies.
- **Sequence**: A sequence represents a client request. For example,
-  the data pointed to by `q` has a shape of
-  `[num_seqs, num_heads, head_size]`. This means that `q` points to a
-  total of `num_seqs` query sequence data. Since this
-  kernel is a single query attention kernel, each sequence only has one
-  query token. Hence, the `num_seqs` equals the total number of tokens
-  that are processed in the batch.
- **Context**: The context consists of the generated tokens from the
-  sequence. For instance, `["What", "is", "your"]` are the context
-  tokens, and the input query token is `"name"`. The model might
-  generate the token `"?"`.
- **Vec**: The vec is a list of elements that are fetched and
-  calculated together. For query and key data, the vec size
-  (`VEC_SIZE`) is determined so that each thread group can fetch and
-  calculate 16 bytes of data at a time. For value data, the vec size
-  (`V_VEC_SIZE`) is determined so that each thread can fetch and
-  calculate 16 bytes of data at a time. For example, if the
-  `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the
-  `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8.
- **Thread group**: The thread group is a small group of
-  threads(`THREAD_GROUP_SIZE`) that fetches and calculates one
-  query token and one key token at a time. Each thread handles only a
-  portion of the token data. The total number of elements processed by
-  one thread group is referred to as `x`. For example, if the thread
-  group contains 2 threads and the head size is 8, then thread 0
-  handles the query and key elements at index 0, 2, 4, 6, while thread
-  1 handles the elements at index 1, 3, 5, 7.
- **Block**: The key and value cache data in vLLM are split into
-  blocks. Each block stores data for a fixed number(`BLOCK_SIZE`)
-  of tokens at one head. Each block may contain only a portion of the
-  whole context tokens. For example, if the block size is 16 and the
-  head size is 128, then for one head, one block can store 16 * 128 =
-  2048 elements.
- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that
-  execute simultaneously on a streaming multiprocessor (SM). In this
-  kernel, each warp processes the calculation between one query token
-  and key tokens of one entire block at a time (it may process multiple
-  blocks in multiple iterations). For example, if there are 4 warps and
-  6 blocks for one context, the assignment would be like warp 0 handles
-  the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
-  handles the 2nd block and warp 3 handles the 3rd block.
- **Thread block**: A thread block is a group of
-  threads(`NUM_THREADS`) that can access the same shared memory.
-  Each thread block contains multiple warps(`NUM_WARPS`), and in
-  this kernel, each thread block processes the calculation between one
-  query token and key tokens of a whole context.
- **Grid**: A grid is a collection of thread blocks and defines the
-  shape of the collection. In this kernel, the shape is
-  `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread
-  block only handles the calculation for one head, one sequence, and
-  one partition.
-## Query
- This section will introduce how query data is stored in memory and
-  fetched by each thread. As mentioned above, each thread group fetches
-  one query token data, while each thread itself only handles a part of
-  one query token data. Within each warp, every thread group will fetch
-  the same query token data, but will multiply it with different key
-  token data.
-  ```cpp
-  const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-  ```
-  :::{figure} ../../assets/kernel/query.png
-  :align: center
-  :alt: query
-  :width: 70%
-  Query data of one token at one head
-  :::
- Each thread defines its own `q_ptr` which points to the assigned
-  query token data on global memory. For example, if `VEC_SIZE` is 4
-  and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
-  total of 128 elements divided into 128 / 4 = 32 vecs.
-  :::{figure} ../../assets/kernel/q_vecs.png
-  :align: center
-  :alt: q_vecs
-  :width: 70%
-  `q_vecs` for one thread group
-  :::
-  ```cpp
-  __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
-  ```
- Next, we need to read the global memory data pointed to by `q_ptr`
-  into shared memory as `q_vecs`. It is important to note that each
-  vec is assigned to a different row. For example, if the
-  `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs,
-  while thread 1 handles the 1st row vecs. By reading the query data in
-  this way, neighboring threads like thread 0 and thread 1 can read
-  neighbor memory, achieving the memory coalescing to improve
-  performance.
-## Key
- Similar to the "Query" section, this section introduces memory layout
-  and assignment for keys. While each thread group only handles one
-  query token per kernel run, it may handle multiple key tokens across
-  multiple iterations. Meanwhile, each warp will process multiple blocks
-  of key tokens in multiple iterations, ensuring that all context
-  tokens are processed by the entire thread group after the kernel run.
-  In this context, "handle" refers to performing the dot multiplication
-  between query data and key data.
-  ```cpp
-  const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
-                      + kv_head_idx * kv_head_stride
-                      + physical_block_offset * x;
-  ```
- Unlike `q_ptr`, `k_ptr` in each thread will point to a different
-  key token at different iterations. As shown above, `k_ptr`
-  points to key token data based on `k_cache` at assigned block,
-  assigned head and assigned token.
-  :::{figure} ../../assets/kernel/key.png
-  :align: center
-  :alt: key
-  :width: 70%
-  Key data of all context tokens at one head
-  :::
- The diagram above illustrates the memory layout for key data. It
-  assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
-  8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each
-  rectangle represents all the elements for one key token at one head,
-  which will be processed by one thread group. The left half shows the
-  total 16 blocks of key token data for warp 0, while the right half
-  represents the remaining key token data for other warps or
-  iterations. Inside each rectangle, there are a total 32 vecs (128
-  elements for one token) that will be processed by 2 threads (one
-  thread group) separately.
-  :::{figure} ../../assets/kernel/k_vecs.png
-  :align: center
-  :alt: k_vecs
-  :width: 70%
-  `k_vecs` for one thread
-  :::
-  ```cpp
-  K_vec k_vecs[NUM_VECS_PER_THREAD]
-  ```
- Next, we need to read the key token data from `k_ptr` and store
-  them on register memory as `k_vecs`. We use register memory for
-  `k_vecs` because it will only be accessed by one thread once,
-  whereas `q_vecs` will be accessed by multiple threads multiple
-  times. Each `k_vecs` will contain multiple vectors for later
-  calculation. Each vec will be set at each inner iteration. The
-  assignment of vecs allows neighboring threads in a warp to read
-  neighboring memory together, which again promotes the memory
-  coalescing. For instance, thread 0 will read vec 0, while thread 1
-  will read vec 1. In the next inner loop, thread 0 will read vec 2,
-  while thread 1 will read vec 3, and so on.
- You may still be a little confused about the overall flow. Don't
-  worry, please keep reading the next "QK" section. It will illustrate
-  the query and key calculation flow in a clearer and higher-level
-  manner.
-## QK
- As shown in the pseudocode below, before the entire for loop block, we
-  fetch the query data for one token and store it in `q_vecs`. Then,
-  in the outer for loop, we iterate through different `k_ptrs` that
-  point to different tokens and prepare the `k_vecs` in the inner for
-  loop. Finally, we perform the dot multiplication between the
-  `q_vecs` and each `k_vecs`.
-  ```cpp
-  q_vecs = ...
-  for ... {
-     k_ptr = ...
-     for ... {
-        k_vecs[i] = ...
-     }
-     ...
-     float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
-  }
-  ```
- As mentioned before, for each thread, it only fetches part of the
-  query and key token data at a time. However, a cross-thread-group
-  reduction happens in `Qk_dot<>::dot`. So the `qk`
-  returned here is not just between part of the query and key token dot
-  multiplication, but actually a full result between entire query and
-  key token data.
- For example, if the value of `HEAD_SIZE` is 128 and
-  `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain
-  total 64 elements. However, the returned `qk` is actually the
-  result of dot multiplication between 128 query elements and 128 key
-  elements. If you want to learn more about the details of the dot
-  multiplication and reduction, you may refer to the implementation of
-  `Qk_dot<>::dot`. However, for the sake of simplicity, I will not
-  cover it in this document.
-## Softmax
- Next, we need to calculate the normalized softmax for all `qk`s,
-  as shown below, where each $x$ represents a `qk`. To do this,
-  we must obtain the reduced value of `qk_max`($m(x)$) and
-  the `exp_sum`($\ell(x)$) of all `qk`s. The reduction
-  should be performed across the entire thread block, encompassing
-  results between the query token and all context key tokens.
-  :::{math}
-  :nowrap: true
-  \begin{gather*}
-  m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
-  \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
-  \end{gather*}
-  :::
-### `qk_max` and `logits`
- Right after we get the `qk` result, we can set the temporary
-  `logits` result with `qk` (in the end, the `logits` should
-  store the normalized softmax result). We can also compare and collect
-  the `qk_max` for all `qk`s that are calculated by the current
-  thread group.
-  ```cpp
-  if (thread_group_offset == 0) {
-     const bool mask = token_idx >= context_len;
-     logits[token_idx - start_token_idx] = mask ? 0.f : qk;
-     qk_max = mask ? qk_max : fmaxf(qk_max, qk);
-  }
-  ```
- Please note that the `logits` here is on shared memory, so each
-  thread group will set the fields for its own assigned context tokens.
-  Overall, the size of logits should be number of context tokens.
-  ```cpp
-  for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
-      qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
-  }
-  if (lane == 0) {
-     red_smem[warp_idx] = qk_max;
-  }
-  ```
- Then we need to get the reduced `qk_max` across each warp. The main
-  idea is to have the threads in a warp communicate with each other to
-  get the final max `qk`.
-  ```cpp
-  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-      qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
-  }
-  qk_max = VLLM_SHFL_SYNC(qk_max, 0);
-  ```
- Finally, we can get the reduced `qk_max` for the whole thread block by
-  comparing the `qk_max` from all warps in this thread block. Then we
-  need to broadcast the final result to each thread.
-### `exp_sum`
- Similar to `qk_max`, we need to get the reduced sum value from the
-  entire thread block too.
-  ```cpp
-  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
-      float val = __expf(logits[i] - qk_max);
-      logits[i] = val;
-      exp_sum += val;
-  }
-  ...
-  exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
-  ```
- Firstly, sum all exp values from each thread group, and meanwhile,
-  convert each entry of `logits` from `qk` to `exp(qk - qk_max)`.
-  Please note, the `qk_max` here is already the max `qk` across the
-  whole thread block. And then we can do reduction for `exp_sum`
-  across whole thread block just like the `qk_max`.
-  ```cpp
-  const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
-  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
-     logits[i] *= inv_sum;
-  }
-  ```
- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain
-  the final normalized softmax result as `logits`. This `logits`
-  variable will be used for dot multiplication with the value data in
-  later steps. Now, it should store the normalized softmax result of
-  `qk` for all assigned context tokens.
-## Value
-:::{figure} ../../assets/kernel/value.png
-:align: center
-:alt: value
-:width: 70%
-Value data of all context tokens at one head
-:::
-:::{figure} ../../assets/kernel/logits_vec.png
-:align: center
-:alt: logits_vec
-:width: 50%
-`logits_vec` for one thread
-:::
-:::{figure} ../../assets/kernel/v_vec.png
-:align: center
-:alt: v_vec
-:width: 70%
-List of `v_vec` for one thread
-:::
- Now we need to retrieve the value data and perform dot multiplication
-  with `logits`. Unlike query and key, there is no thread group
-  concept for value data. As shown in diagram, different from key token
-  memory layout, elements from the same column correspond to the same
-  value token. For one block of value data, there are `HEAD_SIZE` of
-  rows and `BLOCK_SIZE` of columns that are split into multiple
-  `v_vecs`.
- Each thread always fetches `V_VEC_SIZE` elements from the same
-  `V_VEC_SIZE` of tokens at a time. As a result, a single thread
-  retrieves multiple `v_vec`s from different rows and the same
-  columns through multiple inner iterations. For each `v_vec`, it
-  needs to be dot multiplied with the corresponding `logits_vec`,
-  which is also `V_VEC_SIZE` elements from `logits`. Overall, with
-  multiple inner iterations, each warp will process one block of value
-  tokens. And with multiple outer iterations, the whole context value
-  tokens are processed.
-  ```cpp
-  float accs[NUM_ROWS_PER_THREAD];
-  for ... { // Iteration over different blocks.
-      logits_vec = ...
-      for ... { // Iteration over different rows.
-          v_vec = ...
-          ...
-          accs[i] += dot(logits_vec, v_vec);
-      }
-  }
-  ```
- As shown in the above pseudo code, in the outer loop, similar to
-  `k_ptr`, `logits_vec` iterates over different blocks and reads
-  `V_VEC_SIZE` elements from `logits`. In the inner loop, each
-  thread reads `V_VEC_SIZE` elements from the same tokens as a
-  `v_vec` and performs dot multiplication. It is important to note
-  that in each inner iteration, the thread fetches different head
-  position elements for the same tokens. The dot result is then
-  accumulated in `accs`. Therefore, each entry of `accs` is mapped
-  to a head position assigned to the current thread.
- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each
-  thread fetches 8 value elements for 8 tokens at a time. Each element
-  is from different tokens at the same head position. If `HEAD_SIZE`
-  is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to
-  fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are
-  a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle
-  a whole block of value tokens. Each `accs` in each thread
-  contains 8 elements that are accumulated at 8 different head positions.
-  For thread 0, the `accs` variable will have 8 elements, which
-  are the 0th, 32nd, … 224th elements of a value head, accumulated
-  from all 8 assigned tokens.
-## LV
- Now, we need to perform reduction for `accs` within each warp. This
-  process allows each thread to accumulate the `accs` for the
-  assigned head positions of all tokens in one block.
-  ```cpp
-  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-     float acc = accs[i];
-     for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
-        acc += VLLM_SHFL_XOR_SYNC(acc, mask);
-     }
-     accs[i] = acc;
-  }
-  ```
- Next, we perform reduction for `accs` across all warps, allowing
-  each thread to have the accumulation of `accs` for the assigned
-  head positions of all context tokens. Please note that each `accs`
-  in every thread only stores the accumulation for a portion of
-  elements of the entire head for all context tokens. However, overall,
-  all results for output have been calculated but are just stored in
-  different thread register memory.
-  ```cpp
-  float* out_smem = reinterpret_cast<float*>(shared_mem);
-  for (int i = NUM_WARPS; i > 1; i /= 2) {
-      // Upper warps write to shared memory.
-      ...
-          float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
-          for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-                  ...
-          dst[row_idx] = accs[i];
-      }
-      // Lower warps update the output.
-          const float* src = &out_smem[warp_idx * HEAD_SIZE];
-      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-                  ...
-          accs[i] += src[row_idx];
-      }
-          // Write out the accs.
-  }
-  ```
-## Output
- Now we can write all of the calculated results from local register memory
-  to the final output in global memory.
-  ```cpp
-  scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
-                  + head_idx * max_num_partitions * HEAD_SIZE
-                  + partition_idx * HEAD_SIZE;
-  ```
- First, we need to define the `out_ptr` variable, which points to
-  the start address of the assigned sequence and assigned head.
-  ```cpp
-  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-  const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
-  if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
-      from_float(*(out_ptr + row_idx), accs[i]);
-  }
-  }
-  ```
- Finally, we need to iterate over different assigned head positions
-  and write out the corresponding accumulated result based on the
-  `out_ptr`.
--- a/docs/source/features/compatibility_matrix.md
+++ b/docs/source/features/compatibility_matrix.md
-(compatibility-matrix)=
-# Compatibility Matrix
-The tables below show mutually exclusive features and the support on some hardware.
-The symbols used have the following meanings:
- ✅ = Full compatibility
- 🟠 = Partial compatibility
- ❌ = No compatibility
-:::{note}
-Check the ❌ or 🟠 with links to see the tracking issue for the unsupported feature/hardware combination.
-:::
-## Feature x Feature
-:::{raw} html
-<style>
-  /* Make smaller to try to improve readability  */
-  td {
-    font-size: 0.8rem;
-    text-align: center;
-  }
-  th {
-    text-align: center;
-    font-size: 0.8rem;
-  }
-</style>
-:::
-:::{list-table}
-:header-rows: 1
-:stub-columns: 1
-:widths: auto
-:class: vertical-table-header
- * Feature
-  * [CP](#chunked-prefill)
-  * [APC](#automatic-prefix-caching)
-  * [LoRA](#lora-adapter)
-  * <abbr title="Prompt Adapter">prmpt adptr</abbr>
-  * [SD](#spec_decode)
-  * CUDA graph
-  * <abbr title="Pooling Models">pooling</abbr>
-  * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-  * <abbr title="Logprobs">logP</abbr>
-  * <abbr title="Prompt Logprobs">prmpt logP</abbr>
-  * <abbr title="Async Output Processing">async output</abbr>
-  * multi-step
-  * <abbr title="Multimodal Inputs">mm</abbr>
-  * best-of
-  * beam-search
-  * <abbr title="Guided Decoding">guided dec</abbr>
- * [CP](#chunked-prefill)
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * [APC](#automatic-prefix-caching)
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * [LoRA](#lora-adapter)
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * <abbr title="Prompt Adapter">prmpt adptr</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * [SD](#spec_decode)
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * CUDA graph
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * <abbr title="Pooling Models">pooling</abbr>
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-  * ❌
-  * [❌](gh-issue:7366)
-  * ❌
-  * ❌
-  * [❌](gh-issue:7366)
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * <abbr title="Logprobs">logP</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
- * <abbr title="Prompt Logprobs">prmpt logP</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
- * <abbr title="Async Output Processing">async output</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-  * ❌
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
- * multi-step
-  * ❌
-  * ✅
-  * ❌
-  * ✅
-  * ❌
-  * ✅
-  * ❌
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
- * <abbr title="Multimodal Inputs">mm</abbr>
-  * ✅
-  * [🟠](gh-pr:8348)
-  * [🟠](gh-pr:4194)
-  * ❔
-  * ❔
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❔
-  * ✅
-  *
-  *
-  *
- * best-of
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:6137)
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  * ❔
-  * [❌](gh-issue:7968)
-  * ✅
-  * ✅
-  *
-  *
- * beam-search
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:6137)
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  * ❔
-  * [❌](gh-issue:7968)
-  * ❔
-  * ✅
-  * ✅
-  *
- * <abbr title="Guided Decoding">guided dec</abbr>
-  * ✅
-  * ✅
-  * ❔
-  * ❔
-  * [❌](gh-issue:11484)
-  * ✅
-  * ❌
-  * ❔
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:9893)
-  * ❔
-  * ✅
-  * ✅
-  * ✅
-:::
-(feature-x-hardware)=
-## Feature x Hardware
-:::{list-table}
-:header-rows: 1
-:stub-columns: 1
-:widths: auto
- * Feature
-  * Volta
-  * Turing
-  * Ampere
-  * Ada
-  * Hopper
-  * CPU
-  * AMD
- * [CP](#chunked-prefill)
-  * [❌](gh-issue:2729)
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * [APC](#automatic-prefix-caching)
-  * [❌](gh-issue:3687)
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * [LoRA](#lora-adapter)
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * <abbr title="Prompt Adapter">prmpt adptr</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:8475)
-  * ✅
- * [SD](#spec_decode)
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * CUDA graph
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ✅
- * <abbr title="Pooling Models">pooling</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❔
- * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
- * <abbr title="Multimodal Inputs">mm</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * <abbr title="Logprobs">logP</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * <abbr title="Prompt Logprobs">prmpt logP</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * <abbr title="Async Output Processing">async output</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ❌
- * multi-step
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:8477)
-  * ✅
- * best-of
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * beam-search
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
- * <abbr title="Guided Decoding">guided dec</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-:::
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
-(quantization-index)=
-# Quantization
-Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
-:::{toctree}
-:caption: Contents
-:maxdepth: 1
-supported_hardware
-auto_awq
-bnb
-bitblas
-gguf
-gptqmodel
-int4
-int8
-fp8
-quark
-quantized_kvcache
-torchao
-:::
--- a/docs/source/features/quantization/supported_hardware.md
+++ b/docs/source/features/quantization/supported_hardware.md
-(quantization-supported-hardware)=
-# Supported Hardware
-The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
-:::{list-table}
-:header-rows: 1
-:widths: 20 8 8 8 8 8 8 8 8 8 8
- * Implementation
-  * Volta
-  * Turing
-  * Ampere
-  * Ada
-  * Hopper
-  * AMD GPU
-  * Intel GPU
-  * x86 CPU
-  * AWS Inferentia
-  * Google TPU
- * AWQ
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
- * GPTQ
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
- * Marlin (GPTQ/AWQ/FP8)
-  * ❌
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
- * INT8 (W8A8)
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ✅︎
-  * ❌
-  * ✅︎
- * FP8 (W8A8)
-  * ❌
-  * ❌
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
- * BitBLAS (GPTQ)
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
- * AQLM
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
- * bitsandbytes
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
- * DeepSpeedFP
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
- * GGUF
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-:::
- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
- ✅︎ indicates that the quantization method is supported on the specified hardware.
- ❌ indicates that the quantization method is not supported on the specified hardware.
-:::{note}
-This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
-For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
-:::
--- a/docs/source/getting_started/installation.md
+++ b/docs/source/getting_started/installation.md
-(installation-index)=
-# Installation
-vLLM supports the following hardware platforms:
-:::{toctree}
-:maxdepth: 1
-:hidden:
-installation/gpu
-installation/cpu
-installation/ai_accelerator
-:::
- <project:installation/gpu.md>
-  - NVIDIA CUDA
-  - AMD ROCm
-  - Intel XPU
- <project:installation/cpu.md>
-  - Intel/AMD x86
-  - ARM AArch64
-  - Apple silicon
-  - IBM Z (S390X)
- <project:installation/ai_accelerator.md>
-  - Google TPU
-  - Intel Gaudi
-  - AWS Neuron
--- a/docs/source/getting_started/installation/ai_accelerator.md
+++ b/docs/source/getting_started/installation/ai_accelerator.md
-# Other AI accelerators
-vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} Google TPU
-:selected:
-:sync: tpu
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-::::
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-::::
-::::{tab-item} AWS Neuron
-:sync: neuron
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-::::
-:::::
-## Requirements
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} Google TPU
-:sync: tpu
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "## Requirements"
-:end-before: "## Configure a new environment"
-:::
-::::
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "## Requirements"
-:end-before: "## Configure a new environment"
-:::
-::::
-::::{tab-item} AWS Neuron
-:sync: neuron
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "## Requirements"
-:end-before: "## Configure a new environment"
-:::
-::::
-:::::
-## Configure a new environment
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} Google TPU
-:sync: tpu
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "## Configure a new environment"
-:end-before: "## Set up using Python"
-:::
-::::
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "## Configure a new environment"
-:end-before: "## Set up using Python"
-:::
-::::
-::::{tab-item} AWS Neuron
-:sync: neuron
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "## Configure a new environment"
-:end-before: "## Set up using Python"
-:::
-::::
-:::::
-## Set up using Python
-### Pre-built wheels
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} Google TPU
-:sync: tpu
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-::::
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-::::
-::::{tab-item} AWS Neuron
-:sync: neuron
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-::::
-:::::
-### Build wheel from source
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} Google TPU
-:sync: tpu
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-::::
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-::::
-::::{tab-item} AWS Neuron
-:sync: neuron
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-::::
-:::::
-## Set up using Docker
-### Pre-built images
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} Google TPU
-:sync: tpu
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-::::
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-::::
-::::{tab-item} AWS Neuron
-:sync: neuron
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-::::
-:::::
-### Build image from source
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} Google TPU
-:sync: tpu
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-:::
-::::
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-:::
-::::
-::::{tab-item} AWS Neuron
-:sync: neuron
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-:::
-::::
-:::::
-## Extra information
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} Google TPU
-:sync: tpu
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "## Extra information"
-:::
-::::
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "## Extra information"
-:::
-::::
-::::{tab-item} AWS Neuron
-:sync: neuron
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "## Extra information"
-:::
-::::
-:::::
--- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
-# Installation
-vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
-Paged Attention and Chunked Prefill are currently in development and will be available soon.
-Data types currently supported in Neuron SDK are FP16 and BF16.
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
-## Requirements
- OS: Linux
- Python: 3.9 -- 3.11
- Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
- Pytorch 2.0.1/2.1.1
- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
-## Configure a new environment
-### Launch Trn1/Inf2 instances
-Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html).
- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
- To get more information about instances sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/)
- Select Ubuntu Server 22.04 LTS AMI
- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
-### Install drivers and tools
-Installing the drivers and tools is not necessary if the [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. If the drivers and tools are not installed on the operating system, follow the steps below:
-```console
-# Configure Linux for Neuron repository updates
-. /etc/os-release
-sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
-deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
-EOF
-wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
-# Update OS packages
-sudo apt-get update -y
-# Install OS headers
-sudo apt-get install linux-headers-$(uname -r) -y
-# Install git
-sudo apt-get install git -y
-# install Neuron Driver
-sudo apt-get install aws-neuronx-dkms=2.* -y
-# Install Neuron Runtime
-sudo apt-get install aws-neuronx-collectives=2.* -y
-sudo apt-get install aws-neuronx-runtime-lib=2.* -y
-# Install Neuron Tools
-sudo apt-get install aws-neuronx-tools=2.* -y
-# Add PATH
-export PATH=/opt/aws/neuron/bin:$PATH
-```
-## Set up using Python
-### Pre-built wheels
-Currently, there are no pre-built Neuron wheels.
-### Build wheel from source
-:::{note}
-The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-:::
-The following instructions are applicable to Neuron SDK 2.16 and beyond.
-#### Install transformers-neuronx and its dependencies
-[transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances.
-Follow the steps below to install the transformers-neuronx package and its dependencies.
-```console
-# Install Python venv
-sudo apt-get install -y python3.10-venv g++
-# Create Python venv
-python3.10 -m venv aws_neuron_venv_pytorch
-# Activate Python venv
-source aws_neuron_venv_pytorch/bin/activate
-# Install Jupyter notebook kernel
-pip install ipykernel
-python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
-pip install jupyter notebook
-pip install environment_kernels
-# Set pip repository pointing to the Neuron repository
-python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
-# Install wget, awscli
-python -m pip install wget
-python -m pip install awscli
-# Update Neuron Compiler and Framework
-python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
-```
-#### Install vLLM from source
-Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows:
-```console
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -U -r requirements/neuron.txt
-VLLM_TARGET_DEVICE="neuron" pip install .
-```
-If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
-## Set up using Docker
-### Pre-built images
-Currently, there are no pre-built Neuron images.
-### Build image from source
-See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
-Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.
-## Extra information
-There is no extra information for this device.
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
-# Installation
-vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16.
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
-## Requirements
- OS: Linux
- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
-:::{tip}
-[Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
-:::
-## Set up using Python
-### Pre-built wheels
-### Build wheel from source
-:::{include} cpu/build.inc.md
-:::
-:::{note}
- AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
- If you want to force-enable AVX512_BF16 for cross-compilation, please set the environment variable `VLLM_CPU_AVX512BF16=1` before building.
-:::
-## Set up using Docker
-### Pre-built images
-See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
-### Build image from source
-## Extra information
--- a/docs/source/getting_started/installation/gpu.md
+++ b/docs/source/getting_started/installation/gpu.md
-# GPU
-vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions:
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} NVIDIA CUDA
-:selected:
-:sync: cuda
-:::{include} gpu/cuda.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-::::
-::::{tab-item} AMD ROCm
-:sync: rocm
-:::{include} gpu/rocm.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-::::
-::::{tab-item} Intel XPU
-:sync: xpu
-:::{include} gpu/xpu.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-::::
-:::::
-## Requirements
- OS: Linux
- Python: 3.9 -- 3.12
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-:::{include} gpu/cuda.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-::::
-::::{tab-item} AMD ROCm
-:sync: rocm
-:::{include} gpu/rocm.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-::::
-::::{tab-item} Intel XPU
-:sync: xpu
-:::{include} gpu/xpu.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-::::
-:::::
-## Set up using Python
-### Create a new Python environment
-:::{include} python_env_setup.inc.md
-:::
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-:::{include} gpu/cuda.inc.md
-:start-after: "## Create a new Python environment"
-:end-before: "### Pre-built wheels"
-:::
-::::
-::::{tab-item} AMD ROCm
-:sync: rocm
-There is no extra information on creating a new Python environment for this device.
-::::
-::::{tab-item} Intel XPU
-:sync: xpu
-There is no extra information on creating a new Python environment for this device.
-::::
-:::::
-### Pre-built wheels
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-:::{include} gpu/cuda.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-::::
-::::{tab-item} AMD ROCm
-:sync: rocm
-:::{include} gpu/rocm.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-::::
-::::{tab-item} Intel XPU
-:sync: xpu
-:::{include} gpu/xpu.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-::::
-:::::
-(build-from-source)=
-### Build wheel from source
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-:::{include} gpu/cuda.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-::::
-::::{tab-item} AMD ROCm
-:sync: rocm
-:::{include} gpu/rocm.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-::::
-::::{tab-item} Intel XPU
-:sync: xpu
-:::{include} gpu/xpu.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-::::
-:::::
-## Set up using Docker
-### Pre-built images
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-:::{include} gpu/cuda.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-::::
-::::{tab-item} AMD ROCm
-:sync: rocm
-:::{include} gpu/rocm.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-::::
-::::{tab-item} Intel XPU
-:sync: xpu
-:::{include} gpu/xpu.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-::::
-:::::
-### Build image from source
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-:::{include} gpu/cuda.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Supported features"
-:::
-::::
-::::{tab-item} AMD ROCm
-:sync: rocm
-:::{include} gpu/rocm.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Supported features"
-:::
-::::
-::::{tab-item} Intel XPU
-:sync: xpu
-:::{include} gpu/xpu.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Supported features"
-:::
-::::
-:::::
-## Supported features
-:::::{tab-set}
-:sync-group: device
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-:::{include} gpu/cuda.inc.md
-:start-after: "## Supported features"
-:::
-::::
-::::{tab-item} AMD ROCm
-:sync: rocm
-:::{include} gpu/rocm.inc.md
-:start-after: "## Supported features"
-:::
-::::
-::::{tab-item} Intel XPU
-:sync: xpu
-:::{include} gpu/xpu.inc.md
-:start-after: "## Supported features"
-:::
-::::
-:::::
--- a/docs/source/getting_started/installation/python_env_setup.inc.md
+++ b/docs/source/getting_started/installation/python_env_setup.inc.md
-You can create a new Python environment using [conda](https://docs.conda.io/projects/conda/en/stable/user-guide/getting-started.html):
-```console
-# (Recommended) Create a new conda environment.
-conda create -n vllm python=3.12 -y
-conda activate vllm
-```
-:::{note}
-[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please use it only to create the Python environment, not to install packages.
-:::
-Alternatively, you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
-```console
-# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
-uv venv vllm --python 3.12 --seed
-source vllm/bin/activate
-```