Unverified commit 4e4d017b, authored by Hyogeun Oh (오효근), committed by GitHub
Browse files

[Docs] Fix warnings in `mkdocs build` (continued) (#23743)


Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
parent dd589322
......@@ -110,7 +110,7 @@ class StructuredOutputBackend(ABC):
Args:
request_type (StructuredOutputOptions): The type of structured
output request.
output request.
grammar_spec (str): The grammar specification to compile.
Returns:
......@@ -124,7 +124,7 @@ class StructuredOutputBackend(ABC):
Args:
max_num_seqs (int): The maximum number of sequences for which
to allocate the bitmask.
to allocate the bitmask.
"""
@abstractmethod
......
......@@ -525,9 +525,6 @@ class InputBatch:
Any consecutive empty indices at the very end of the list are not
filled.
Args:
empty_req_indices: empty indices which may be filled.
Returns:
swaps: list of (from,to) swap tuples for moved requests
empty_req_indices: indices not filled by condensation
......
......@@ -2955,7 +2955,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
Args:
kv_cache_config: The KV cache config
kv_cache_raw_tensors: The KV cache buffer of each layer, with
correct size but uninitialized shape.
correct size but uninitialized shape.
Returns:
Dict[str, torch.Tensor]: A map between layer names to their
corresponding memory buffer for KV cache.
......
......@@ -552,7 +552,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return kv_cache_spec
def _get_slot_mapping_metadata(self, num_reqs,
num_scheduled_tokens_per_req):
num_scheduled_tokens_per_req) -> np.ndarray:
"""
Computes metadata for mapping slots to blocks in the key-value (KV)
cache for a batch of requests.
......@@ -565,15 +565,15 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
Args:
num_reqs (int): Number of requests in the current batch.
num_scheduled_tokens_per_req (int or np.ndarray): Number of tokens
to be scheduled for each request.
to be scheduled for each request.
Returns:
np.ndarray: A 2D array of shape (total_block_len, 3), where each row
contains:
contains:
- kv_cache_start_index (int): The starting index in the KV cache
for the corresponding slice.
for the corresponding slice.
- new_kv_start_index (int): The starting index in the new KV
cache for the corresponding slice.
cache for the corresponding slice.
- slice_len (int): The length of the slice.
"""
slices_start = self.input_batch.num_computed_tokens_cpu[:num_reqs]
......
......@@ -172,10 +172,10 @@ def scatter_mm_placeholders(
Args:
embeds: The multimodal embeddings.
Shape: `(num_embeds, embed_dim)`
Shape: `(num_embeds, embed_dim)`
is_embed: A boolean mask indicating which positions in the placeholder
tokens need to be filled with multimodal embeddings.
Shape: `(num_placeholders, num_embeds)`
tokens need to be filled with multimodal embeddings.
Shape: `(num_placeholders, num_embeds)`
"""
if is_embed is None:
return embeds
......@@ -278,7 +278,7 @@ def bind_kv_cache(
Args:
kv_caches: The allocated kv_caches with layer names as keys.
forward_context: The global forward context containing all Attention
layers with layer names as keys.
layers with layer names as keys.
runner_kv_caches: The kv_cache declared by ModelRunner.
"""
# Bind kv_caches to ModelRunner
......
......@@ -36,8 +36,8 @@ class WorkerBase(WorkerBaseV0):
local_rank: Local device index
rank: Global rank in distributed setup
distributed_init_method: Distributed initialization method
is_driver_worker: Whether this worker handles driver
responsibilities
is_driver_worker: Whether this worker handles driver
responsibilities
"""
# Configuration storage
super().__init__(vllm_config=vllm_config)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment