Unverified commit d6484ef3, authored by Harry Mellor, committed by GitHub
Browse files

Add full API docs and improve the UX of navigating them (#17485)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 46fae69c
......@@ -2,7 +2,7 @@
from itertools import groupby
from pathlib import Path
from typing import TYPE_CHECKING, Optional, TypeVar, Union
from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
from urllib.parse import ParseResult, urlparse
import numpy as np
......@@ -24,6 +24,10 @@ _M = TypeVar("_M")
if TYPE_CHECKING:
from .hasher import MultiModalHashDict
from .inputs import MultiModalKwargs, MultiModalPlaceholderDict
else:
MultiModalHashDict = Any
MultiModalKwargs = Any
MultiModalPlaceholderDict = Any
class MediaConnector:
......@@ -255,7 +259,7 @@ class MediaConnector:
global_media_connector = MediaConnector()
"""The global :class:`MediaConnector` instance used by vLLM."""
"""The global {class}`MediaConnector` instance used by vLLM."""
fetch_audio = global_media_connector.fetch_audio
fetch_image = global_media_connector.fetch_image
......@@ -293,24 +297,24 @@ def encode_video_base64(frames: npt.NDArray) -> str:
def merge_and_sort_multimodal_metadata(
mm_positions: "MultiModalPlaceholderDict",
mm_hashes: Optional["MultiModalHashDict"],
mm_positions: MultiModalPlaceholderDict,
mm_hashes: Optional[MultiModalHashDict],
) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
"""Given a MultiModalPlaceholderDict, merge all PlaceholderRange
objects from all available modalities into a single list of
PlaceholderRange, sorted by their offset (starting index in the input
PlaceholderRange, sorted by their offset (starting index in the input
sequence) in the ascending order.
Optionally if a MultiModalHashDict is given, same operation will be
Optionally if a `MultiModalHashDict` is given, same operation will be
applied to the object and the sorted list of hashes will be returned.
Returns:
list[str]: List of item modalities in order of their positions in
the input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if
given, None otherwise.
list[str]: List of item modalities in order of their positions in the
input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
None otherwise.
"""
modalities = list(mm_positions.keys())
......@@ -352,22 +356,23 @@ def merge_and_sort_multimodal_metadata(
def group_mm_inputs_by_modality(
mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]:
"""Group consecutive MultiModalKwargs from mm_inputs with the same modality
together into the same list for batching purpose. For MultiModalKwargs with
mm_inputs: list[MultiModalKwargs]) -> list[list[MultiModalKwargs]]:
"""Group consecutive MultiModalKwargs from mm_inputs with the same modality
together into the same list for batching purpose. For MultiModalKwargs with
multiple modalities, put them into their own list.
Args:
mm_inputs: List of MultiModalKwargs.
Returns:
list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each
inner list contains consecutive MultiModalKwargs with same modality.
list[list[vllm.multimodal.MultiModalKwargs]]: List of list of
`MultiModalKwargs`, each inner list contains consecutive
`MultiModalKwargs` with same modality.
"""
if not mm_inputs:
return []
def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]:
def modality_group_func(mm_input: MultiModalKwargs) -> Union[str, int]:
# If the input has multiple modalities, return an id as the unique key
# for this mm_input.
if len(mm_input.modalities) > 1:
......
......@@ -19,8 +19,6 @@ if TYPE_CHECKING:
else:
VllmConfig = None
logger = init_logger(__name__)
class CpuPlatform(Platform):
_enum = PlatformEnum.CPU
......
......@@ -454,10 +454,4 @@ finally:
CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
try:
from sphinx.ext.autodoc.mock import _MockModule
if not isinstance(pynvml, _MockModule):
CudaPlatform.log_warnings()
except ModuleNotFoundError:
CudaPlatform.log_warnings()
CudaPlatform.log_warnings()
......@@ -146,7 +146,7 @@ class Platform:
return self._enum == PlatformEnum.OOT
def is_cuda_alike(self) -> bool:
"""Stateless version of :func:`torch.cuda.is_available`."""
"""Stateless version of {func}`torch.cuda.is_available`."""
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
def is_sleep_mode_available(self) -> bool:
......@@ -165,7 +165,7 @@ class Platform:
cls,
device_id: int = 0,
) -> Optional[DeviceCapability]:
"""Stateless version of :func:`torch.cuda.get_device_capability`."""
"""Stateless version of {func}`torch.cuda.get_device_capability`."""
return None
@classmethod
......@@ -180,7 +180,7 @@ class Platform:
The ``capability`` argument can either be:
- A tuple ``(major, minor)``.
- An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`)
- An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`)
"""
current_capability = cls.get_device_capability(device_id=device_id)
if current_capability is None:
......
# SPDX-License-Identifier: Apache-2.0
from .layerwise_profile import layerwise_profile
__all__ = [
"layerwise_profile",
]
......@@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1
def array_full(token_id: int, count: int):
""":class:`array` equivalent of :func:`numpy.full`."""
"""{class}`array` equivalent of {func}`numpy.full`."""
return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
......@@ -192,11 +192,11 @@ class SequenceData(msgspec.Struct,
def from_prompt_token_counts(
*token_counts: tuple[int, int]) -> "SequenceData":
"""
Construct a :class:`SequenceData` instance by concatenating
Construct a {class}`SequenceData` instance by concatenating
prompt token sequences.
Each tuple represents one token sequence, expressed in the form
:code:`(token_id, count)`.
`(token_id, count)`.
"""
if len(token_counts) == 0:
return SequenceData.from_seqs([])
......@@ -216,7 +216,7 @@ class SequenceData(msgspec.Struct,
prompt_embeds: Optional[torch.Tensor] = None,
) -> "SequenceData":
"""
Construct a :class:`SequenceData` instance from prompt and output
Construct a {class}`SequenceData` instance from prompt and output
token sequences.
"""
prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,
......@@ -452,9 +452,9 @@ class SequenceData(msgspec.Struct,
class Sequence:
"""Stores the data, status, and block information of a sequence.
The sequence is constructed from the :data:`DecoderOnlyInputs`
(for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder)
instance passed in through the :code:`inputs` constructor argument.
The sequence is constructed from the {data}`DecoderOnlyInputs`
(for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder)
instance passed in through the `inputs` constructor argument.
Args:
seq_id: The ID of the sequence.
......
......@@ -52,7 +52,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase):
"""Create a SmallerTpProposerWorker.
Args:
worker (MultiStepWorker): an actual worker wrapped with this class
worker (~vllm.spec_decode.multi_step_worker.MultiStepWorker): an
actual worker wrapped with this class
draft_ranks (List[int]): if this value is given, only the GPU ranks
written in this value participate in draft generation
"""
......
......@@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig):
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
output_router_logits (`bool`, *optional*, defaults to `False`):
Whether or not the router logits should be returned by the model. Enabling this will also
allow the model to output the auxiliary loss. See [here]() for more details
Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
The aux loss factor for the total loss.
......
......@@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig):
Instantiating a configuration with the defaults will yield a similar
configuration to that of the Exaone
Configuration objects inherit from :class:`~transformers.PretrainedConfig`
Configuration objects inherit from {class}`~transformers.PretrainedConfig`
and can be used to control the model outputs. Read the documentation from
{class}`~transformers.PretrainedConfig` for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 50257):
vocab_size ({obj}`int`, `optional`, defaults to 50257):
Vocabulary size of the GPT Lingvo model. Defines the number of
different tokens that can be represented by the :obj:`inputs_ids`
passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
different tokens that can be represented by the {obj}`inputs_ids`
passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
size of the model.
Defines the different tokens that can be represented by the
`inputs_ids` passed to the forward method of
{class}`~transformers.EXAONEModel`.
hidden_size (:obj:`int`, `optional`, defaults to 2048):
hidden_size ({obj}`int`, `optional`, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer.
num_layers (:obj:`int`, `optional`, defaults to 24):
num_layers ({obj}`int`, `optional`, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the
......@@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig):
specified, will default to `num_attention_heads`.
rotary_pct (`float`, *optional*, defaults to 0.25):
percentage of hidden dimensions to allocate to rotary embeddings
intermediate_size (:obj:`int`, `optional`, defaults to 8192):
intermediate_size ({obj}`int`, `optional`, defaults to 8192):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in
the Transformer encoder.
activation_function (:obj:`str` or :obj:`function`, `optional`,
defaults to :obj:`"gelu_new"`):
activation_function ({obj}`str` or {obj}`function`, `optional`,
defaults to {obj}`"gelu_new"`):
The non-linear activation function (function or string) in the
encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
:obj:`"selu"` and :obj:`"gelu_new"` are supported.
embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
{obj}`"selu"` and {obj}`"gelu_new"` are supported.
embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
The dropout probability for all fully connected layers in the
embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling
:class:`~transformers.EXAONEModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
type_vocab_size ({obj}`int`, `optional`, defaults to 2):
The vocabulary size of the {obj}`token_type_ids` passed when calling
{class}`~transformers.EXAONEModel`.
initializer_range ({obj}`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for
initializing all weight matrices.
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
The epsilon used by the layer normalization layers.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
Whether or not the model should return the last key/values
attentions (not used by all models).
Only relevant if ``config.is_decoder=True``.
gradient_checkpointing (:obj:`bool`, `optional`,
defaults to :obj:`False`):
gradient_checkpointing ({obj}`bool`, `optional`,
defaults to {obj}`False`):
If True, use gradient checkpointing to save memory at the expense
of slower backward pass.
Example::
......
......@@ -39,9 +39,9 @@ def decode_tokens(
) -> str:
"""
Backend-agnostic equivalent of HF's
:code:`tokenizer.decode(token_ids, ...)`.
`tokenizer.decode(token_ids, ...)`.
:code:`skip_special_tokens=None` means to use the backend's default
`skip_special_tokens=None` means to use the backend's default
settings.
"""
if skip_special_tokens is not None:
......@@ -61,9 +61,9 @@ def encode_tokens(
) -> list[int]:
"""
Backend-agnostic equivalent of HF's
:code:`tokenizer.encode(text, ...)`.
`tokenizer.encode(text, ...)`.
:code:`add_special_tokens=None` means to use the backend's default
`add_special_tokens=None` means to use the backend's default
settings.
"""
......
......@@ -309,8 +309,8 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
"""
Gets the cumulative number of hits and queries against this cache.
If :code:`delta=True`, instead gets these statistics
since the last call that also passed :code:`delta=True`.
If `delta=True`, instead gets these statistics
since the last call that also passed `delta=True`.
"""
info = CacheInfo(hits=self._hits, total=self._total)
......@@ -983,7 +983,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:
def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
"""
Unlike :class:`itertools.groupby`, groups are not broken by
Unlike {class}`itertools.groupby`, groups are not broken by
non-contiguous data.
"""
groups = defaultdict[_K, list[_V]](list)
......@@ -1773,14 +1773,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
def is_in_doc_build() -> bool:
try:
from sphinx.ext.autodoc.mock import _MockModule
return isinstance(torch, _MockModule)
except ModuleNotFoundError:
return False
def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
"""
Import a Python file according to its file path.
......@@ -1820,10 +1812,11 @@ class _PlaceholderBase:
Disallows downstream usage of placeholder modules.
We need to explicitly override each dunder method because
:meth:`__getattr__` is not called when they are accessed.
{meth}`__getattr__` is not called when they are accessed.
See also:
[Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
:::{seealso}
[Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
:::
"""
def __getattr__(self, key: str) -> Never:
......@@ -2052,9 +2045,6 @@ def direct_register_custom_op(
library object. If you want to bind the operator to a different library,
make sure the library object is alive when the operator is used.
"""
if is_in_doc_build():
return
if not supports_custom_op():
from vllm.platforms import current_platform
assert not current_platform.is_cuda_alike(), (
......
# SPDX-License-Identifier: Apache-2.0
"""
# MLA Common Components
This file implements common components for MLA implementations.
First we define:
......
......@@ -180,6 +180,7 @@ class KVCacheManager:
as eagle.
Blocks layout:
```
-----------------------------------------------------------------------
| < computed > | < new computed > | < new > | < pre-allocated > |
-----------------------------------------------------------------------
......@@ -189,6 +190,7 @@ class KVCacheManager:
------------------------------------------------
| <new full> |
--------------
```
The following *_blocks are illustrated in this layout.
Returns:
......
......@@ -308,7 +308,7 @@ class OutputProcessor:
* If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects.
****************** NOTE FOR DEVELOPERS ******************
NOTE FOR DEVELOPERS
vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the
......@@ -316,8 +316,6 @@ class OutputProcessor:
If you need to touch every element of the batch, do it from
within the loop below.
**********************************************************
"""
request_outputs: list[RequestOutput] = []
......
......@@ -75,7 +75,7 @@ class RejectionSampler(nn.Module):
outside of the rejection sampler with the default sampling
strategy. It allows for more flexibility in the sampling
process such as top_p, top_k sampling.
sampling_metadata (SamplingMetadata):
sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
Additional metadata needed for sampling, such as temperature,
top-k/top-p parameters, or other relevant information.
Returns:
......
......@@ -170,9 +170,10 @@ class Worker(WorkerBase):
Then, it calculates the free memory that can be used for KV cache in
bytes.
.. tip::
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::{tip}
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::
"""
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
......
......@@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
) -> None:
"""
Perform sanity checks for the result of
:meth:`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
{meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
"""
assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
"Expected multimodal embeddings to be a list/tuple of 2D tensors, "
......@@ -39,7 +39,7 @@ def scatter_mm_placeholders(
Scatter the multimodal embeddings into a contiguous tensor that represents
the placeholder tokens.
:class:`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
{class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
Args:
embeds: The multimodal embeddings.
......@@ -66,7 +66,7 @@ def gather_mm_placeholders(
"""
Reconstructs the embeddings from the placeholder tokens.
This is the inverse operation of :func:`scatter_mm_placeholders`.
This is the inverse operation of {func}`scatter_mm_placeholders`.
"""
if is_embed is None:
return placeholders
......
......@@ -201,9 +201,10 @@ class HPUWorker(LocalOrDistributedWorkerBase):
Then, it calculates the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
.. tip::
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::{tip}
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
......
......@@ -734,11 +734,11 @@ def _pythonize_sampler_output(
cache: Optional[PythonizationCache],
) -> None:
""" This function is only called when the output tensors are ready.
See :class:`ModelOutput`.
See {class}`ModelOutput`.
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
adding a Pythonized output data structure
(:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`.
({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
Args:
model_input
......
......@@ -230,9 +230,10 @@ class Worker(LocalOrDistributedWorkerBase):
Then, it calculates the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
.. tip::
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::{tip}
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
:::
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment