Unverified commit 6c9fdbf7, authored by Harry Mellor, committed by GitHub
Browse files

[Docs] Replace `rst` style double-backtick with `md` single-backtick (#27091)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 483ea646
......@@ -70,7 +70,7 @@ class BasevLLMParameter(Parameter):
# NOTE(@ksayers) some models such as mamba_mixer2 override the
# weight loader to support custom loading. In the future, model-specific
# weight loading should be implemented via Model.load_weights. In the
# meantime, support deleting and overriding `weight_loader`` attribute
# meantime, support deleting and overriding `weight_loader` attribute
if self._weight_loader is None:
raise AttributeError(
f"{self.__class__.__name__} weight_loader attribute has been deleted"
......
......@@ -332,8 +332,8 @@ class PromptInsertion(PromptUpdate):
Example:
For each image, insert a number of ``<image>`` feature placeholders
equal to the feature size of the vision encoder after the ``<s>`` token:
For each image, insert a number of `<image>` feature placeholders
equal to the feature size of the vision encoder after the `<s>` token:
```python
PromptInsertion(
......@@ -353,7 +353,7 @@ class PromptInsertion(PromptUpdate):
)
```
Insert these tokens after a prefix ``Images:``:
Insert these tokens after a prefix `Images:`:
```python
PromptInsertion(
......@@ -401,8 +401,8 @@ class PromptReplacement(PromptUpdate):
Example:
For each image, replace one ``<image>`` input placeholder in the prompt
with a number of ``<image>`` feature placeholders
For each image, replace one `<image>` input placeholder in the prompt
with a number of `<image>` feature placeholders
equal to the feature size of the vision encoder:
```python
......@@ -413,8 +413,8 @@ class PromptReplacement(PromptUpdate):
)
```
As above, but further pad the feature placeholders with ``<image_bos>``
and `<image_eos>``, which are not supposed to be passed to the vision
As above, but further pad the feature placeholders with `<image_bos>`
and `<image_eos>`, which are not supposed to be passed to the vision
encoder:
```python
......
......@@ -307,7 +307,7 @@ class MultiModalRegistry:
"""
Create dummy data for profiling the memory usage of a model.
The model is identified by ``model_config``.
The model is identified by `model_config`.
"""
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)
......@@ -340,7 +340,7 @@ class MultiModalRegistry:
"""
Create dummy data for profiling the memory usage of a model.
The model is identified by ``model_config``.
The model is identified by `model_config`.
"""
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)
......
......@@ -75,7 +75,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
"0x74bd": "AMD_Instinct_MI300X_HF",
}
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
if "HIP_VISIBLE_DEVICES" in os.environ:
val = os.environ["HIP_VISIBLE_DEVICES"]
if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
......
......@@ -168,7 +168,7 @@ class XPUPlatform(Platform):
parallel_config.distributed_executor_backend = "uni"
elif parallel_config.distributed_executor_backend == "mp":
# FIXME(kunshang):
# spawn needs calling `if __name__ == '__main__':``
# spawn needs calling `if __name__ == '__main__':`
# fork is not supported for xpu start new process.
if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
......
......@@ -306,10 +306,10 @@ class SamplingParams(
)
def __post_init__(self) -> None:
# how we deal with `best_of``:
# if `best_of`` is not set, we default to `n`;
# if `best_of`` is set, we set `n`` to `best_of`,
# and set `_real_n`` to the original `n`.
# how we deal with `best_of`:
# if `best_of` is not set, we default to `n`;
# if `best_of` is set, we set `n` to `best_of`,
# and set `_real_n` to the original `n`.
# when we return the result, we will check
# if we need to return `n` or `_real_n` results
if self.best_of:
......
......@@ -21,7 +21,7 @@ from vllm.utils import cdiv, has_deep_gemm
@functools.cache
def is_deep_gemm_supported() -> bool:
"""Return ``True`` if DeepGEMM is supported on the current platform.
"""Return `True` if DeepGEMM is supported on the current platform.
Currently, only Hopper and Blackwell GPUs are supported.
"""
is_supported_arch = current_platform.is_cuda() and (
......@@ -33,7 +33,7 @@ def is_deep_gemm_supported() -> bool:
@functools.cache
def is_deep_gemm_e8m0_used() -> bool:
"""Return ``True`` if vLLM is configured to use DeepGEMM
"""Return `True` if vLLM is configured to use DeepGEMM
E8M0 scale on a Hopper or Blackwell-class GPU.
"""
if not is_deep_gemm_supported():
......@@ -311,9 +311,9 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor):
"""Return a global difference metric for unit tests.
DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
error, causing ``torch.testing.assert_close`` to fail. Instead of checking
error, causing `torch.testing.assert_close` to fail. Instead of checking
every element, we compute a cosine-style similarity over the whole tensor
and report ``1 - sim``. Once kernel accuracy improves this helper can be
and report `1 - sim`. Once kernel accuracy improves this helper can be
removed.
"""
......
......@@ -34,7 +34,7 @@ FLASHINFER_CUBINS_REPOSITORY = os.environ.get(
@functools.cache
def has_flashinfer() -> bool:
"""Return ``True`` if FlashInfer is available."""
"""Return `True` if FlashInfer is available."""
# Use find_spec to check if the module exists without importing it
# This avoids potential CUDA initialization side effects
if importlib.util.find_spec("flashinfer") is None:
......@@ -114,13 +114,13 @@ autotune = _lazy_import_wrapper(
@functools.cache
def has_flashinfer_comm() -> bool:
"""Return ``True`` if FlashInfer comm module is available."""
"""Return `True` if FlashInfer comm module is available."""
return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None
@functools.cache
def has_flashinfer_all2all() -> bool:
"""Return ``True`` if FlashInfer mnnvl all2all is available."""
"""Return `True` if FlashInfer mnnvl all2all is available."""
if not has_flashinfer_comm():
return False
......@@ -141,7 +141,7 @@ def has_flashinfer_all2all() -> bool:
@functools.cache
def has_flashinfer_moe() -> bool:
"""Return ``True`` if FlashInfer MoE module is available."""
"""Return `True` if FlashInfer MoE module is available."""
return (
has_flashinfer()
and importlib.util.find_spec("flashinfer.fused_moe") is not None
......@@ -150,7 +150,7 @@ def has_flashinfer_moe() -> bool:
@functools.cache
def has_flashinfer_cutlass_fused_moe() -> bool:
"""Return ``True`` if FlashInfer CUTLASS fused MoE is available."""
"""Return `True` if FlashInfer CUTLASS fused MoE is available."""
if not has_flashinfer_moe():
return False
......@@ -171,7 +171,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
@functools.cache
def has_nvidia_artifactory() -> bool:
"""Return ``True`` if NVIDIA's artifactory is accessible.
"""Return `True` if NVIDIA's artifactory is accessible.
This checks connectivity to the kernel inference library artifactory
which is required for downloading certain cubin kernels like TRTLLM FHMA.
......@@ -218,9 +218,9 @@ def _force_use_trtllm_attention(env_value: bool | None) -> bool | None:
def force_use_trtllm_attention() -> bool | None:
"""
Return ``None`` if VLLM_USE_TRTLLM_ATTENTION is not set,
return ``True`` if TRTLLM attention is forced to be used,
return ``False`` if TRTLLM attention is forced to be not used.
Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set,
return `True` if TRTLLM attention is forced to be used,
return `False` if TRTLLM attention is forced to be not used.
"""
return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)
......@@ -244,7 +244,7 @@ def use_trtllm_attention(
has_sinks: bool = False,
has_spec: bool = False,
) -> bool:
"""Return ``True`` if TRTLLM attention is used."""
"""Return `True` if TRTLLM attention is used."""
force_use_trtllm = force_use_trtllm_attention()
# Environment variable is set to 0 - respect it
......
......@@ -26,17 +26,17 @@ from vllm.v1.kv_cache_interface import (
from vllm.v1.request import Request
# BlockHash represents the hash of a single KV-cache block used for
# prefix caching. Treating it as a distinct type from ``bytes`` helps
# prefix caching. Treating it as a distinct type from `bytes` helps
# catch accidental misuse when passing around raw byte strings.
BlockHash = NewType("BlockHash", bytes)
# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID.
# `BlockHashWithGroupId` combines a `BlockHash` with its KV cache group ID.
# It is represented as raw bytes for compactness and efficiency. The helper
# functions below pack/unpack the ``BlockHash`` and group id into/from the key.
# functions below pack/unpack the `BlockHash` and group id into/from the key.
BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
# ExternalBlockHash is used for reproducible prefix-cache block hashing.
# It's a union of ``bytes`` and ``int`` to keep backward compatibility
# It's a union of `bytes` and `int` to keep backward compatibility
# after we default block hashing to use sha256 bytes.
ExternalBlockHash: TypeAlias = bytes | int
......@@ -44,7 +44,7 @@ ExternalBlockHash: TypeAlias = bytes | int
def make_block_hash_with_group_id(
block_hash: BlockHash, group_id: int
) -> BlockHashWithGroupId:
"""Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``.
"""Pack a `BlockHash` and group id into a `BlockHashWithGroupId`.
The group id is encoded using 4 bytes in big-endian order and appended to
the block hash bytes. This representation avoids creating tuples while
......@@ -54,12 +54,12 @@ def make_block_hash_with_group_id(
def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
"""Extract the ``BlockHash`` from a ``BlockHashWithGroupId``."""
"""Extract the `BlockHash` from a `BlockHashWithGroupId`."""
return BlockHash(key[:-4])
def get_group_id(key: BlockHashWithGroupId) -> int:
"""Extract the group id from a ``BlockHashWithGroupId``."""
"""Extract the group id from a `BlockHashWithGroupId`."""
return int.from_bytes(key[-4:], "big", signed=False)
......
......@@ -128,7 +128,7 @@ class CPUWorker(Worker):
"Please try to bind threads manually."
)
# Get CPUs on NUMA node `allowed_numa_nodes[local_rank]``
# Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore
logical_cpu_list = [
x for x in logical_cpu_list if x.numa_node == selected_numa_node
......
......@@ -182,8 +182,8 @@ class TPUWorker:
if isinstance(layer_spec, AttentionSpec):
dtype = layer_spec.dtype
# Use an empty tensor instead of `None`` to force Dynamo to pass
# it by reference, rather than by specializing on the value ``None``.
# Use an empty tensor instead of `None` to force Dynamo to pass
# it by reference, rather than by specializing on the value `None`.
tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
kv_caches[layer_name] = tpu_kv_cache
else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment