Unverified commit d6484ef3, authored by Harry Mellor and committed by GitHub
Browse files

Add full API docs and improve the UX of navigating them (#17485)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 46fae69c
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
from itertools import groupby from itertools import groupby
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Optional, TypeVar, Union from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
from urllib.parse import ParseResult, urlparse from urllib.parse import ParseResult, urlparse
import numpy as np import numpy as np
...@@ -24,6 +24,10 @@ _M = TypeVar("_M") ...@@ -24,6 +24,10 @@ _M = TypeVar("_M")
if TYPE_CHECKING: if TYPE_CHECKING:
from .hasher import MultiModalHashDict from .hasher import MultiModalHashDict
from .inputs import MultiModalKwargs, MultiModalPlaceholderDict from .inputs import MultiModalKwargs, MultiModalPlaceholderDict
else:
MultiModalHashDict = Any
MultiModalKwargs = Any
MultiModalPlaceholderDict = Any
class MediaConnector: class MediaConnector:
...@@ -255,7 +259,7 @@ class MediaConnector: ...@@ -255,7 +259,7 @@ class MediaConnector:
global_media_connector = MediaConnector() global_media_connector = MediaConnector()
"""The global :class:`MediaConnector` instance used by vLLM.""" """The global {class}`MediaConnector` instance used by vLLM."""
fetch_audio = global_media_connector.fetch_audio fetch_audio = global_media_connector.fetch_audio
fetch_image = global_media_connector.fetch_image fetch_image = global_media_connector.fetch_image
...@@ -293,24 +297,24 @@ def encode_video_base64(frames: npt.NDArray) -> str: ...@@ -293,24 +297,24 @@ def encode_video_base64(frames: npt.NDArray) -> str:
def merge_and_sort_multimodal_metadata( def merge_and_sort_multimodal_metadata(
mm_positions: "MultiModalPlaceholderDict", mm_positions: MultiModalPlaceholderDict,
mm_hashes: Optional["MultiModalHashDict"], mm_hashes: Optional[MultiModalHashDict],
) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]: ) -> tuple[list[str], list[PlaceholderRange], Optional[list[str]]]:
"""Given a MultiModalPlaceholderDict, merge all PlaceholderRange """Given a MultiModalPlaceholderDict, merge all PlaceholderRange
objects from all available modalities into a single list of objects from all available modalities into a single list of
PlaceholderRange, sorted by their offset (starting index in the input PlaceholderRange, sorted by their offset (starting index in the input
sequence) in the ascending order. sequence) in the ascending order.
Optionally if a MultiModalHashDict is given, same operation will be Optionally if a `MultiModalHashDict` is given, same operation will be
applied to the object and the sorted list of hashes will be returned. applied to the object and the sorted list of hashes will be returned.
Returns: Returns:
list[str]: List of item modalities in order of their positions in list[str]: List of item modalities in order of their positions in the
the input sequence. input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from
mm_positions. mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
given, None otherwise. None otherwise.
""" """
modalities = list(mm_positions.keys()) modalities = list(mm_positions.keys())
...@@ -352,7 +356,7 @@ def merge_and_sort_multimodal_metadata( ...@@ -352,7 +356,7 @@ def merge_and_sort_multimodal_metadata(
def group_mm_inputs_by_modality( def group_mm_inputs_by_modality(
mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]: mm_inputs: list[MultiModalKwargs]) -> list[list[MultiModalKwargs]]:
"""Group consecutive MultiModalKwargs from mm_inputs with the same modality """Group consecutive MultiModalKwargs from mm_inputs with the same modality
together into the same list for batching purpose. For MultiModalKwargs with together into the same list for batching purpose. For MultiModalKwargs with
multiple modalities, put them into their own list. multiple modalities, put them into their own list.
...@@ -361,13 +365,14 @@ def group_mm_inputs_by_modality( ...@@ -361,13 +365,14 @@ def group_mm_inputs_by_modality(
mm_inputs: List of MultiModalKwargs. mm_inputs: List of MultiModalKwargs.
Returns: Returns:
list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each list[list[vllm.multimodal.MultiModalKwargs]]: List of list of
inner list contains consecutive MultiModalKwargs with same modality. `MultiModalKwargs`, each inner list contains consecutive
`MultiModalKwargs` with same modality.
""" """
if not mm_inputs: if not mm_inputs:
return [] return []
def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]: def modality_group_func(mm_input: MultiModalKwargs) -> Union[str, int]:
# If the input has multiple modalities, return a id as the unique key # If the input has multiple modalities, return a id as the unique key
# for the mm_input input. # for the mm_input input.
if len(mm_input.modalities) > 1: if len(mm_input.modalities) > 1:
......
...@@ -19,8 +19,6 @@ if TYPE_CHECKING: ...@@ -19,8 +19,6 @@ if TYPE_CHECKING:
else: else:
VllmConfig = None VllmConfig = None
logger = init_logger(__name__)
class CpuPlatform(Platform): class CpuPlatform(Platform):
_enum = PlatformEnum.CPU _enum = PlatformEnum.CPU
......
...@@ -454,10 +454,4 @@ finally: ...@@ -454,10 +454,4 @@ finally:
CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform
try: CudaPlatform.log_warnings()
from sphinx.ext.autodoc.mock import _MockModule
if not isinstance(pynvml, _MockModule):
CudaPlatform.log_warnings()
except ModuleNotFoundError:
CudaPlatform.log_warnings()
...@@ -146,7 +146,7 @@ class Platform: ...@@ -146,7 +146,7 @@ class Platform:
return self._enum == PlatformEnum.OOT return self._enum == PlatformEnum.OOT
def is_cuda_alike(self) -> bool: def is_cuda_alike(self) -> bool:
"""Stateless version of :func:`torch.cuda.is_available`.""" """Stateless version of {func}`torch.cuda.is_available`."""
return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
def is_sleep_mode_available(self) -> bool: def is_sleep_mode_available(self) -> bool:
...@@ -165,7 +165,7 @@ class Platform: ...@@ -165,7 +165,7 @@ class Platform:
cls, cls,
device_id: int = 0, device_id: int = 0,
) -> Optional[DeviceCapability]: ) -> Optional[DeviceCapability]:
"""Stateless version of :func:`torch.cuda.get_device_capability`.""" """Stateless version of {func}`torch.cuda.get_device_capability`."""
return None return None
@classmethod @classmethod
...@@ -180,7 +180,7 @@ class Platform: ...@@ -180,7 +180,7 @@ class Platform:
The ``capability`` argument can either be: The ``capability`` argument can either be:
- A tuple ``(major, minor)``. - A tuple ``(major, minor)``.
- An integer ``<major><minor>``. (See :meth:`DeviceCapability.to_int`) - An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`)
""" """
current_capability = cls.get_device_capability(device_id=device_id) current_capability = cls.get_device_capability(device_id=device_id)
if current_capability is None: if current_capability is None:
......
# SPDX-License-Identifier: Apache-2.0
from .layerwise_profile import layerwise_profile
__all__ = [
"layerwise_profile",
]
...@@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1 ...@@ -27,7 +27,7 @@ VLLM_INVALID_TOKEN_ID = -1
def array_full(token_id: int, count: int): def array_full(token_id: int, count: int):
""":class:`array` equivalent of :func:`numpy.full`.""" """{class}`array` equivalent of {func}`numpy.full`."""
return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
...@@ -192,11 +192,11 @@ class SequenceData(msgspec.Struct, ...@@ -192,11 +192,11 @@ class SequenceData(msgspec.Struct,
def from_prompt_token_counts( def from_prompt_token_counts(
*token_counts: tuple[int, int]) -> "SequenceData": *token_counts: tuple[int, int]) -> "SequenceData":
""" """
Construct a :class:`SequenceData` instance by concatenating Construct a {class}`SequenceData` instance by concatenating
prompt token sequences. prompt token sequences.
Each tuple represents one token sequence, expressed in the form Each tuple represents one token sequence, expressed in the form
:code:`(token_id, count)`. `(token_id, count)`.
""" """
if len(token_counts) == 0: if len(token_counts) == 0:
return SequenceData.from_seqs([]) return SequenceData.from_seqs([])
...@@ -216,7 +216,7 @@ class SequenceData(msgspec.Struct, ...@@ -216,7 +216,7 @@ class SequenceData(msgspec.Struct,
prompt_embeds: Optional[torch.Tensor] = None, prompt_embeds: Optional[torch.Tensor] = None,
) -> "SequenceData": ) -> "SequenceData":
""" """
Construct a :class:`SequenceData` instance from prompt and output Construct a {class}`SequenceData` instance from prompt and output
token sequences. token sequences.
""" """
prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,
...@@ -452,9 +452,9 @@ class SequenceData(msgspec.Struct, ...@@ -452,9 +452,9 @@ class SequenceData(msgspec.Struct,
class Sequence: class Sequence:
"""Stores the data, status, and block information of a sequence. """Stores the data, status, and block information of a sequence.
The sequence is constructed from the :data:`DecoderOnlyInputs` The sequence is constructed from the {data}`DecoderOnlyInputs`
(for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) (for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder)
instance passed in through the :code:`inputs` constructor argument. instance passed in through the `inputs` constructor argument.
Args: Args:
seq_id: The ID of the sequence. seq_id: The ID of the sequence.
......
...@@ -52,7 +52,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase): ...@@ -52,7 +52,8 @@ class SmallerTpProposerWorker(ProposerWorkerBase):
"""Create a SmallerTpProposerWorker. """Create a SmallerTpProposerWorker.
Args: Args:
worker (MultiStepWorker): an actual worker wrapped with this class worker (~vllm.spec_decode.multi_step_worker.MultiStepWorker): an
actual worker wrapped with this class
draft_ranks (List[int]): if this value is given, only the GPU ranks draft_ranks (List[int]): if this value is given, only the GPU ranks
written in this value participate in draft generation written in this value participate in draft generation
""" """
......
...@@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig): ...@@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig):
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
output_router_logits (`bool`, *optional*, defaults to `False`): output_router_logits (`bool`, *optional*, defaults to `False`):
Whether or not the router logits should be returned by the model. Enabling this will also Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
allow the model to output the auxiliary loss. See [here]() for more details
router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
The aux loss factor for the total loss. The aux loss factor for the total loss.
......
...@@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig): ...@@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig):
Instantiating a configuration with the defaults will yield a similar Instantiating a configuration with the defaults will yield a similar
configuration to that of the Exaone configuration to that of the Exaone
Configuration objects inherit from :class:`~transformers.PretrainedConfig` Configuration objects inherit from {class}`~transformers.PretrainedConfig`
and can be used to control the model outputs. Read the documentation from : and can be used to control the model outputs. Read the documentation from :
class:`~transformers.PretrainedConfig` for more information. class:`~transformers.PretrainedConfig` for more information.
Args: Args:
vocab_size (:obj:`int`, `optional`, defaults to 50257): vocab_size ({obj}`int`, `optional`, defaults to 50257):
Vocabulary size of the GPT Lingvo model. Defines the number of Vocabulary size of the GPT Lingvo model. Defines the number of
different tokens that can be represented by the :obj:`inputs_ids` different tokens that can be represented by the {obj}`inputs_ids`
passed when calling :class:`~transformers.ExaoneModel`. Vocabulary passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
size of the model. size of the model.
Defines the different tokens that can be represented by the Defines the different tokens that can be represented by the
`inputs_ids` passed to the forward method of :class: `inputs_ids` passed to the forward method of :class:
`~transformers.EXAONEModel`. `~transformers.EXAONEModel`.
hidden_size (:obj:`int`, `optional`, defaults to 2048): hidden_size ({obj}`int`, `optional`, defaults to 2048):
Dimensionality of the encoder layers and the pooler layer. Dimensionality of the encoder layers and the pooler layer.
num_layers (:obj:`int`, `optional`, defaults to 24): num_layers ({obj}`int`, `optional`, defaults to 24):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32): num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Number of attention heads for each attention layer in the
...@@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig): ...@@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig):
specified, will default to `num_attention_heads`. specified, will default to `num_attention_heads`.
rotary_pct (`float`, *optional*, defaults to 0.25): rotary_pct (`float`, *optional*, defaults to 0.25):
percentage of hidden dimensions to allocate to rotary embeddings percentage of hidden dimensions to allocate to rotary embeddings
intermediate_size (:obj:`int`, `optional`, defaults to 8192): intermediate_size ({obj}`int`, `optional`, defaults to 8192):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in Dimensionality of the "intermediate" (i.e., feed-forward) layer in
the Transformer encoder. the Transformer encoder.
activation_function (:obj:`str` or :obj:`function`, `optional`, activation_function ({obj}`str` or {obj}`function`, `optional`,
defaults to :obj:`"gelu_new"`): defaults to {obj}`"gelu_new"`):
The non-linear activation function (function or string) in the The non-linear activation function (function or string) in the
encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
:obj:`"selu"` and :obj:`"gelu_new"` are supported. {obj}`"selu"` and {obj}`"gelu_new"` are supported.
embed_dropout (:obj:`float`, `optional`, defaults to 0.0): embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
The dropout probabilitiy for all fully connected layers in the The dropout probabilitiy for all fully connected layers in the
embeddings, encoder, and pooler. embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.0): attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
The dropout ratio for the attention probabilities. The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, `optional`, defaults to 2048): max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
The maximum sequence length that this model might ever be used with. The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case Typically set this to something large just in case
(e.g., 512 or 1024 or 2048). (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 2): type_vocab_size ({obj}`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed when calling The vocabulary size of the {obj}`token_type_ids` passed when calling
:class:`~transformers.EXAONEModel`. {class}`~transformers.EXAONEModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02): initializer_range ({obj}`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for The standard deviation of the truncated_normal_initializer for
initializing all weight matrices. initializing all weight matrices.
layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
The epsilon used by the layer normalization layers. The epsilon used by the layer normalization layers.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
Whether or not the model should return the last key/values Whether or not the model should return the last key/values
attentions (not used by all models). attentions (not used by all models).
Only relevant if ``config.is_decoder=True``. Only relevant if ``config.is_decoder=True``.
gradient_checkpointing (:obj:`bool`, `optional`, gradient_checkpointing ({obj}`bool`, `optional`,
defaults to :obj:`False`): defaults to {obj}`False`):
If True, use gradient checkpointing to save memory at the expense If True, use gradient checkpointing to save memory at the expense
of slower backward pass. of slower backward pass.
Example:: Example::
......
...@@ -39,9 +39,9 @@ def decode_tokens( ...@@ -39,9 +39,9 @@ def decode_tokens(
) -> str: ) -> str:
""" """
Backend-agnostic equivalent of HF's Backend-agnostic equivalent of HF's
:code:`tokenizer.decode(token_ids, ...)`. `tokenizer.decode(token_ids, ...)`.
:code:`skip_special_tokens=None` means to use the backend's default `skip_special_tokens=None` means to use the backend's default
settings. settings.
""" """
if skip_special_tokens is not None: if skip_special_tokens is not None:
...@@ -61,9 +61,9 @@ def encode_tokens( ...@@ -61,9 +61,9 @@ def encode_tokens(
) -> list[int]: ) -> list[int]:
""" """
Backend-agnostic equivalent of HF's Backend-agnostic equivalent of HF's
:code:`tokenizer.encode(text, ...)`. `tokenizer.encode(text, ...)`.
:code:`add_special_tokens=None` means to use the backend's default `add_special_tokens=None` means to use the backend's default
settings. settings.
""" """
......
...@@ -309,8 +309,8 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]): ...@@ -309,8 +309,8 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
""" """
Gets the cumulative number of hits and queries against this cache. Gets the cumulative number of hits and queries against this cache.
If :code:`delta=True`, instead gets these statistics If `delta=True`, instead gets these statistics
since the last call that also passed :code:`delta=True`. since the last call that also passed `delta=True`.
""" """
info = CacheInfo(hits=self._hits, total=self._total) info = CacheInfo(hits=self._hits, total=self._total)
...@@ -983,7 +983,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]: ...@@ -983,7 +983,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:
def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]): def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
""" """
Unlike :class:`itertools.groupby`, groups are not broken by Unlike {class}`itertools.groupby`, groups are not broken by
non-contiguous data. non-contiguous data.
""" """
groups = defaultdict[_K, list[_V]](list) groups = defaultdict[_K, list[_V]](list)
...@@ -1773,14 +1773,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor: ...@@ -1773,14 +1773,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor) return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
def is_in_doc_build() -> bool:
try:
from sphinx.ext.autodoc.mock import _MockModule
return isinstance(torch, _MockModule)
except ModuleNotFoundError:
return False
def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
""" """
Import a Python file according to its file path. Import a Python file according to its file path.
...@@ -1820,10 +1812,11 @@ class _PlaceholderBase: ...@@ -1820,10 +1812,11 @@ class _PlaceholderBase:
Disallows downstream usage of placeholder modules. Disallows downstream usage of placeholder modules.
We need to explicitly override each dunder method because We need to explicitly override each dunder method because
:meth:`__getattr__` is not called when they are accessed. {meth}`__getattr__` is not called when they are accessed.
See also: :::{seealso}
[Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup) [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
:::
""" """
def __getattr__(self, key: str) -> Never: def __getattr__(self, key: str) -> Never:
...@@ -2052,9 +2045,6 @@ def direct_register_custom_op( ...@@ -2052,9 +2045,6 @@ def direct_register_custom_op(
library object. If you want to bind the operator to a different library, library object. If you want to bind the operator to a different library,
make sure the library object is alive when the operator is used. make sure the library object is alive when the operator is used.
""" """
if is_in_doc_build():
return
if not supports_custom_op(): if not supports_custom_op():
from vllm.platforms import current_platform from vllm.platforms import current_platform
assert not current_platform.is_cuda_alike(), ( assert not current_platform.is_cuda_alike(), (
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
# MLA Common Components
This file implements common components for MLA implementations. This file implements common components for MLA implementations.
First we define: First we define:
......
...@@ -180,6 +180,7 @@ class KVCacheManager: ...@@ -180,6 +180,7 @@ class KVCacheManager:
as eagle. as eagle.
Blocks layout: Blocks layout:
```
----------------------------------------------------------------------- -----------------------------------------------------------------------
| < computed > | < new computed > | < new > | < pre-allocated > | | < computed > | < new computed > | < new > | < pre-allocated > |
----------------------------------------------------------------------- -----------------------------------------------------------------------
...@@ -189,6 +190,7 @@ class KVCacheManager: ...@@ -189,6 +190,7 @@ class KVCacheManager:
------------------------------------------------ ------------------------------------------------
| <new full> | | <new full> |
-------------- --------------
```
The following *_blocks are illustrated in this layout. The following *_blocks are illustrated in this layout.
Returns: Returns:
......
...@@ -308,7 +308,7 @@ class OutputProcessor: ...@@ -308,7 +308,7 @@ class OutputProcessor:
* If there is no queue (for usage with LLMEngine), * If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects. return a list of RequestOutput objects.
****************** NOTE FOR DEVELOPERS ****************** NOTE FOR DEVELOPERS
vLLM V1 minimizes the number of python loops over the full vLLM V1 minimizes the number of python loops over the full
batch to ensure system overheads are minimized. This is the batch to ensure system overheads are minimized. This is the
...@@ -316,8 +316,6 @@ class OutputProcessor: ...@@ -316,8 +316,6 @@ class OutputProcessor:
If you need to touch every element of the batch, do it from If you need to touch every element of the batch, do it from
within the loop below. within the loop below.
**********************************************************
""" """
request_outputs: list[RequestOutput] = [] request_outputs: list[RequestOutput] = []
......
...@@ -75,7 +75,7 @@ class RejectionSampler(nn.Module): ...@@ -75,7 +75,7 @@ class RejectionSampler(nn.Module):
outside of the rejection sampler with the default sampling outside of the rejection sampler with the default sampling
strategy. It allows for more flexibility in the sampling strategy. It allows for more flexibility in the sampling
process such as top_p, top_k sampling. process such as top_p, top_k sampling.
sampling_metadata (SamplingMetadata): sampling_metadata (vllm.v1.sample.metadata.SamplingMetadata):
Additional metadata needed for sampling, such as temperature, Additional metadata needed for sampling, such as temperature,
top-k/top-p parameters, or other relevant information. top-k/top-p parameters, or other relevant information.
Returns: Returns:
......
...@@ -170,9 +170,10 @@ class Worker(WorkerBase): ...@@ -170,9 +170,10 @@ class Worker(WorkerBase):
Then, it calculate the free memory that can be used for KV cache in Then, it calculate the free memory that can be used for KV cache in
bytes. bytes.
.. tip:: :::{tip}
You may limit the usage of GPU memory You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter. by adjusting the `gpu_memory_utilization` parameter.
:::
""" """
torch.cuda.empty_cache() torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats() torch.cuda.reset_peak_memory_stats()
......
...@@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs( ...@@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
) -> None: ) -> None:
""" """
Perform sanity checks for the result of Perform sanity checks for the result of
:meth:`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`. {meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
""" """
assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), ( assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
"Expected multimodal embeddings to be a list/tuple of 2D tensors, " "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
...@@ -39,7 +39,7 @@ def scatter_mm_placeholders( ...@@ -39,7 +39,7 @@ def scatter_mm_placeholders(
Scatter the multimodal embeddings into a contiguous tensor that represents Scatter the multimodal embeddings into a contiguous tensor that represents
the placeholder tokens. the placeholder tokens.
:class:`vllm.multimodal.processing.PromptUpdateDetails.is_embed`. {class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
Args: Args:
embeds: The multimodal embeddings. embeds: The multimodal embeddings.
...@@ -66,7 +66,7 @@ def gather_mm_placeholders( ...@@ -66,7 +66,7 @@ def gather_mm_placeholders(
""" """
Reconstructs the embeddings from the placeholder tokens. Reconstructs the embeddings from the placeholder tokens.
This is the operation of :func:`scatter_mm_placeholders`. This is the operation of {func}`scatter_mm_placeholders`.
""" """
if is_embed is None: if is_embed is None:
return placeholders return placeholders
......
...@@ -201,9 +201,10 @@ class HPUWorker(LocalOrDistributedWorkerBase): ...@@ -201,9 +201,10 @@ class HPUWorker(LocalOrDistributedWorkerBase):
Then, it calculate the maximum possible number of GPU and CPU blocks Then, it calculate the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory. that can be allocated with the remaining free memory.
.. tip:: :::{tip}
You may limit the usage of GPU memory You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter. by adjusting the `gpu_memory_utilization` parameter.
:::
""" """
# Profile the memory usage of the model and get the maximum number of # Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory. # cache blocks that can be allocated with the remaining free memory.
......
...@@ -734,11 +734,11 @@ def _pythonize_sampler_output( ...@@ -734,11 +734,11 @@ def _pythonize_sampler_output(
cache: Optional[PythonizationCache], cache: Optional[PythonizationCache],
) -> None: ) -> None:
""" This function is only called when the output tensors are ready. """ This function is only called when the output tensors are ready.
See :class:`ModelOutput`. See {class}`ModelOutput`.
Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
adding a Pythonized output data structure adding a Pythonized output data structure
(:class:`CompletionSequenceGroupOutput`) for each :class:`SequenceGroup`. ({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
Args: Args:
model_input model_input
......
...@@ -230,9 +230,10 @@ class Worker(LocalOrDistributedWorkerBase): ...@@ -230,9 +230,10 @@ class Worker(LocalOrDistributedWorkerBase):
Then, it calculate the maximum possible number of GPU and CPU blocks Then, it calculate the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory. that can be allocated with the remaining free memory.
.. tip:: :::{tip}
You may limit the usage of GPU memory You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter. by adjusting the `gpu_memory_utilization` parameter.
:::
""" """
# Profile the memory usage of the model and get the maximum number of # Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory. # cache blocks that can be allocated with the remaining free memory.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment