Unverified commit 4e4d017b, authored by Hyogeun Oh (오효근), committed by GitHub
Browse files

[Docs] Fix warnings in `mkdocs build` (continued) (#23743)


Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
parent dd589322
......@@ -207,7 +207,7 @@ class NaiveBlockAllocator(BlockAllocator):
Args:
absolute_id (int): The absolute block id for the block
in whole allocator.
in whole allocator.
Returns:
int: The zero-offset block id on certain device.
......
......@@ -61,7 +61,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
Args:
num_blocks (int): The total number of blocks to manage.
block_size (int): The size of each block in tokens.
block_ids(Optional[Iterable[int]], optional): An optional iterable of
block_ids (Optional[Iterable[int]], optional): An optional iterable of
block IDs. If not provided, block IDs will be assigned sequentially
from 0 to num_blocks - 1.
"""
......
......@@ -657,7 +657,7 @@ class Scheduler:
`budget.num_batched_tokens` has not enough capacity to schedule
all tokens.
partial_prefill_metadata: information about the partial prefills
that are currently running
that are currently running
Returns:
SchedulerRunningOutputs.
......
......@@ -491,7 +491,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
kv_cache: shape =
[2, num_blocks, block_size * num_kv_heads * head_size]
NOTE: kv_cache will be an empty tensor with shape [0]
for profiling run.
attn_metadata: Metadata for attention.
......
......@@ -438,7 +438,8 @@ class FlashAttentionImpl(AttentionImpl):
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: shape =
[2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
......
......@@ -637,11 +637,9 @@ class FlashInferImpl(AttentionImpl):
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache: shape -
# NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
# HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
kv_cache: KV cache tensor with different possible shapes:
- NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
- HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
......
......@@ -689,7 +689,8 @@ class FlexAttentionImpl(AttentionImpl):
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: shape =
[2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
......
......@@ -235,7 +235,8 @@ class PallasAttentionBackendImpl(AttentionImpl):
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
kv_cache: shape =
[num_blocks, block_size, num_kv_heads * 2, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
......@@ -329,7 +330,7 @@ def write_to_kv_cache(
Args:
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size]
kv_cache: shape = [num_blocks, block_size, num_kv_heads * 2, head_size]
num_slices_per_kv_cache_update_block: int
"""
_, page_size, num_combined_kv_heads, head_size = kv_cache.shape
......
......@@ -429,7 +429,8 @@ class AiterFlashAttentionImpl(AttentionImpl):
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: shape =
[2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
......
......@@ -362,7 +362,8 @@ class TreeAttentionImpl(AttentionImpl):
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: shape =
[2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
......
......@@ -285,7 +285,8 @@ class TritonAttentionImpl(AttentionImpl):
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: shape =
[2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
......
......@@ -330,7 +330,8 @@ class XFormersAttentionImpl(AttentionImpl):
query: shape = [num_tokens, num_heads, head_size]
key: shape = [num_tokens, num_kv_heads, head_size]
value: shape = [num_tokens, num_kv_heads, head_size]
kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
kv_cache: shape =
[2, num_blocks, block_size, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
......
......@@ -255,9 +255,9 @@ def compute_encoder_budget(
Returns:
- Compute budget for encoder execution, measured in number of tokens
from the input sequence.
from the input sequence.
- Space budget for encoder cache size, measured in number of tokens
from the input sequence.
from the input sequence.
"""
if mm_registry.supports_multimodal_inputs(model_config):
max_tokens_by_modality = mm_registry \
......@@ -303,9 +303,9 @@ def compute_mm_encoder_budget(
Returns:
- Compute budget for encoder execution, measured in number of tokens
from the input sequence.
from the input sequence.
- Space budget for encoder cache size, measured in number of tokens
from the input sequence.
from the input sequence.
"""
if not max_tokens_by_modality:
......
......@@ -119,7 +119,8 @@ class KVCacheCoordinator(ABC):
Args:
request: The request.
num_tokens: The total number of tokens that need to be cached
num_computed_tokens: The total number of tokens
that need to be cached
(including tokens that are already cached).
"""
for manager in self.single_type_managers:
......
......@@ -54,14 +54,15 @@ class KVCacheBlocks:
def get_block_ids(
self,
allow_none: bool = False,
):
) -> Optional[tuple[list[int], ...]]:
"""
Converts the KVCacheBlocks instance to block_ids.
Returns:
tuple[list[int], ...]: A tuple of lists where
* the outer tuple corresponds to KV cache groups
* each inner list contains the block_ids of the blocks in that group
tuple[list[int], ...]: A tuple of lists where:
- the outer tuple corresponds to KV cache groups
- each inner list contains the block_ids of the blocks in that
group
"""
if allow_none and all(len(group) == 0 for group in self.blocks):
return None
......
......@@ -8,6 +8,7 @@ from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.executor.ray_distributed_executor import ( # noqa
RayDistributedExecutor as RayDistributedExecutorV0)
from vllm.logger import init_logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
from vllm.v1.executor.abstract import Executor
from vllm.v1.outputs import ModelRunnerOutput
......@@ -64,7 +65,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
def execute_model(
self,
scheduler_output,
scheduler_output: SchedulerOutput,
) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
"""Execute the model on the Ray workers.
......
......@@ -36,7 +36,7 @@ def setup_multiprocess_prometheus():
"and vLLM will properly handle cleanup.")
def get_prometheus_registry():
def get_prometheus_registry() -> CollectorRegistry:
"""Get the appropriate prometheus registry based on multiprocessing
configuration.
......
......@@ -91,7 +91,7 @@ class LogitsProcessor(ABC):
to each forward pass.
Args:
batch_update is non-None iff there have been
changes to the batch makeup.
batch_update: Non-None iff there have been changes
to the batch makeup.
"""
raise NotImplementedError
......@@ -68,7 +68,7 @@ class RejectionSampler(nn.Module):
different requests are flattened into a single tensor because
this is the shape of the output logits.
NOTE: `target_logits` can be updated in place to save memory.
bonus_token_ids_tensor (torch.Tensor):
bonus_token_ids (torch.Tensor):
A tensor containing bonus tokens. Shape is [batch_size, 1].
Bonus tokens are added to the end of the sequence if all
proposed tokens are accepted. We generate the bonus tokens
......
......@@ -89,7 +89,7 @@ class Sampler(nn.Module):
Gather logprobs for topk and sampled/prompt token.
Args:
logits: (num tokens) x (vocab) tensor
logprobs: (num tokens) x (vocab) tensor
num_logprobs: minimum number of logprobs to
retain per token
token_ids: prompt tokens (if prompt logprobs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment