Unverified commit 6c9fdbf7, authored by Harry Mellor, committed by GitHub
Browse files

[Docs] Replace `rst` style double-backtick with `md` single-backtick (#27091)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 483ea646
...@@ -70,7 +70,7 @@ class BasevLLMParameter(Parameter): ...@@ -70,7 +70,7 @@ class BasevLLMParameter(Parameter):
# NOTE(@ksayers) some models such as mamba_mixer2 override the # NOTE(@ksayers) some models such as mamba_mixer2 override the
# weight loader to support custom loading. In the future, model-specific # weight loader to support custom loading. In the future, model-specific
# weight loading should be implemented via Model.load_weights. In the # weight loading should be implemented via Model.load_weights. In the
# meantime, support deleting and overriding `weight_loader`` attribute # meantime, support deleting and overriding `weight_loader` attribute
if self._weight_loader is None: if self._weight_loader is None:
raise AttributeError( raise AttributeError(
f"{self.__class__.__name__} weight_loader attribute has been deleted" f"{self.__class__.__name__} weight_loader attribute has been deleted"
......
...@@ -332,8 +332,8 @@ class PromptInsertion(PromptUpdate): ...@@ -332,8 +332,8 @@ class PromptInsertion(PromptUpdate):
Example: Example:
For each image, insert a number of ``<image>`` feature placeholders For each image, insert a number of `<image>` feature placeholders
equal to the feature size of the vision encoder after the ``<s>`` token: equal to the feature size of the vision encoder after the `<s>` token:
```python ```python
PromptInsertion( PromptInsertion(
...@@ -353,7 +353,7 @@ class PromptInsertion(PromptUpdate): ...@@ -353,7 +353,7 @@ class PromptInsertion(PromptUpdate):
) )
``` ```
Insert these tokens after a prefix ``Images:``: Insert these tokens after a prefix `Images:`:
```python ```python
PromptInsertion( PromptInsertion(
...@@ -401,8 +401,8 @@ class PromptReplacement(PromptUpdate): ...@@ -401,8 +401,8 @@ class PromptReplacement(PromptUpdate):
Example: Example:
For each image, replace one ``<image>`` input placeholder in the prompt For each image, replace one `<image>` input placeholder in the prompt
with a number of ``<image>`` feature placeholders with a number of `<image>` feature placeholders
equal to the feature size of the vision encoder: equal to the feature size of the vision encoder:
```python ```python
...@@ -413,8 +413,8 @@ class PromptReplacement(PromptUpdate): ...@@ -413,8 +413,8 @@ class PromptReplacement(PromptUpdate):
) )
``` ```
As above, but further pad the feature placeholders with ``<image_bos>`` As above, but further pad the feature placeholders with `<image_bos>`
and `<image_eos>``, which are not supposed to be passed to the vision and `<image_eos>`, which are not supposed to be passed to the vision
encoder: encoder:
```python ```python
......
...@@ -307,7 +307,7 @@ class MultiModalRegistry: ...@@ -307,7 +307,7 @@ class MultiModalRegistry:
""" """
Create dummy data for profiling the memory usage of a model. Create dummy data for profiling the memory usage of a model.
The model is identified by ``model_config``. The model is identified by `model_config`.
""" """
processor = self.create_processor(model_config, cache=cache) processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor) profiler: MultiModalProfiler = MultiModalProfiler(processor)
...@@ -340,7 +340,7 @@ class MultiModalRegistry: ...@@ -340,7 +340,7 @@ class MultiModalRegistry:
""" """
Create dummy data for profiling the memory usage of a model. Create dummy data for profiling the memory usage of a model.
The model is identified by ``model_config``. The model is identified by `model_config`.
""" """
processor = self.create_processor(model_config, cache=cache) processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor) profiler: MultiModalProfiler = MultiModalProfiler(processor)
......
...@@ -75,7 +75,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = { ...@@ -75,7 +75,7 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = {
"0x74bd": "AMD_Instinct_MI300X_HF", "0x74bd": "AMD_Instinct_MI300X_HF",
} }
# Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`` # Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`
if "HIP_VISIBLE_DEVICES" in os.environ: if "HIP_VISIBLE_DEVICES" in os.environ:
val = os.environ["HIP_VISIBLE_DEVICES"] val = os.environ["HIP_VISIBLE_DEVICES"]
if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None): if cuda_val := os.environ.get("CUDA_VISIBLE_DEVICES", None):
......
...@@ -168,7 +168,7 @@ class XPUPlatform(Platform): ...@@ -168,7 +168,7 @@ class XPUPlatform(Platform):
parallel_config.distributed_executor_backend = "uni" parallel_config.distributed_executor_backend = "uni"
elif parallel_config.distributed_executor_backend == "mp": elif parallel_config.distributed_executor_backend == "mp":
# FIXME(kunshang): # FIXME(kunshang):
# spawn needs calling `if __name__ == '__main__':`` # spawn needs calling `if __name__ == '__main__':`
# fork is not supported for xpu start new process. # fork is not supported for xpu start new process.
if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn": if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
......
...@@ -306,10 +306,10 @@ class SamplingParams( ...@@ -306,10 +306,10 @@ class SamplingParams(
) )
def __post_init__(self) -> None: def __post_init__(self) -> None:
# how we deal with `best_of``: # how we deal with `best_of`:
# if `best_of`` is not set, we default to `n`; # if `best_of` is not set, we default to `n`;
# if `best_of`` is set, we set `n`` to `best_of`, # if `best_of` is set, we set `n` to `best_of`,
# and set `_real_n`` to the original `n`. # and set `_real_n` to the original `n`.
# when we return the result, we will check # when we return the result, we will check
# if we need to return `n` or `_real_n` results # if we need to return `n` or `_real_n` results
if self.best_of: if self.best_of:
......
...@@ -21,7 +21,7 @@ from vllm.utils import cdiv, has_deep_gemm ...@@ -21,7 +21,7 @@ from vllm.utils import cdiv, has_deep_gemm
@functools.cache @functools.cache
def is_deep_gemm_supported() -> bool: def is_deep_gemm_supported() -> bool:
"""Return ``True`` if DeepGEMM is supported on the current platform. """Return `True` if DeepGEMM is supported on the current platform.
Currently, only Hopper and Blackwell GPUs are supported. Currently, only Hopper and Blackwell GPUs are supported.
""" """
is_supported_arch = current_platform.is_cuda() and ( is_supported_arch = current_platform.is_cuda() and (
...@@ -33,7 +33,7 @@ def is_deep_gemm_supported() -> bool: ...@@ -33,7 +33,7 @@ def is_deep_gemm_supported() -> bool:
@functools.cache @functools.cache
def is_deep_gemm_e8m0_used() -> bool: def is_deep_gemm_e8m0_used() -> bool:
"""Return ``True`` if vLLM is configured to use DeepGEMM " """Return `True` if vLLM is configured to use DeepGEMM "
"E8M0 scale on a Hopper or Blackwell-class GPU. "E8M0 scale on a Hopper or Blackwell-class GPU.
""" """
if not is_deep_gemm_supported(): if not is_deep_gemm_supported():
...@@ -311,9 +311,9 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor): ...@@ -311,9 +311,9 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor):
"""Return a global difference metric for unit tests. """Return a global difference metric for unit tests.
DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
error, causing ``torch.testing.assert_close`` to fail. Instead of checking error, causing `torch.testing.assert_close` to fail. Instead of checking
every element, we compute a cosine-style similarity over the whole tensor every element, we compute a cosine-style similarity over the whole tensor
and report ``1 - sim``. Once kernel accuracy improves this helper can be and report `1 - sim`. Once kernel accuracy improves this helper can be
removed. removed.
""" """
......
...@@ -34,7 +34,7 @@ FLASHINFER_CUBINS_REPOSITORY = os.environ.get( ...@@ -34,7 +34,7 @@ FLASHINFER_CUBINS_REPOSITORY = os.environ.get(
@functools.cache @functools.cache
def has_flashinfer() -> bool: def has_flashinfer() -> bool:
"""Return ``True`` if FlashInfer is available.""" """Return `True` if FlashInfer is available."""
# Use find_spec to check if the module exists without importing it # Use find_spec to check if the module exists without importing it
# This avoids potential CUDA initialization side effects # This avoids potential CUDA initialization side effects
if importlib.util.find_spec("flashinfer") is None: if importlib.util.find_spec("flashinfer") is None:
...@@ -114,13 +114,13 @@ autotune = _lazy_import_wrapper( ...@@ -114,13 +114,13 @@ autotune = _lazy_import_wrapper(
@functools.cache @functools.cache
def has_flashinfer_comm() -> bool: def has_flashinfer_comm() -> bool:
"""Return ``True`` if FlashInfer comm module is available.""" """Return `True` if FlashInfer comm module is available."""
return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None return has_flashinfer() and importlib.util.find_spec("flashinfer.comm") is not None
@functools.cache @functools.cache
def has_flashinfer_all2all() -> bool: def has_flashinfer_all2all() -> bool:
"""Return ``True`` if FlashInfer mnnvl all2all is available.""" """Return `True` if FlashInfer mnnvl all2all is available."""
if not has_flashinfer_comm(): if not has_flashinfer_comm():
return False return False
...@@ -141,7 +141,7 @@ def has_flashinfer_all2all() -> bool: ...@@ -141,7 +141,7 @@ def has_flashinfer_all2all() -> bool:
@functools.cache @functools.cache
def has_flashinfer_moe() -> bool: def has_flashinfer_moe() -> bool:
"""Return ``True`` if FlashInfer MoE module is available.""" """Return `True` if FlashInfer MoE module is available."""
return ( return (
has_flashinfer() has_flashinfer()
and importlib.util.find_spec("flashinfer.fused_moe") is not None and importlib.util.find_spec("flashinfer.fused_moe") is not None
...@@ -150,7 +150,7 @@ def has_flashinfer_moe() -> bool: ...@@ -150,7 +150,7 @@ def has_flashinfer_moe() -> bool:
@functools.cache @functools.cache
def has_flashinfer_cutlass_fused_moe() -> bool: def has_flashinfer_cutlass_fused_moe() -> bool:
"""Return ``True`` if FlashInfer CUTLASS fused MoE is available.""" """Return `True` if FlashInfer CUTLASS fused MoE is available."""
if not has_flashinfer_moe(): if not has_flashinfer_moe():
return False return False
...@@ -171,7 +171,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool: ...@@ -171,7 +171,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
@functools.cache @functools.cache
def has_nvidia_artifactory() -> bool: def has_nvidia_artifactory() -> bool:
"""Return ``True`` if NVIDIA's artifactory is accessible. """Return `True` if NVIDIA's artifactory is accessible.
This checks connectivity to the kernel inference library artifactory This checks connectivity to the kernel inference library artifactory
which is required for downloading certain cubin kernels like TRTLLM FHMA. which is required for downloading certain cubin kernels like TRTLLM FHMA.
...@@ -218,9 +218,9 @@ def _force_use_trtllm_attention(env_value: bool | None) -> bool | None: ...@@ -218,9 +218,9 @@ def _force_use_trtllm_attention(env_value: bool | None) -> bool | None:
def force_use_trtllm_attention() -> bool | None: def force_use_trtllm_attention() -> bool | None:
""" """
Return ``None`` if VLLM_USE_TRTLLM_ATTENTION is not set, Return `None` if VLLM_USE_TRTLLM_ATTENTION is not set,
return ``True`` if TRTLLM attention is forced to be used, return `True` if TRTLLM attention is forced to be used,
return ``False`` if TRTLLM attention is forced to be not used. return `False` if TRTLLM attention is forced to be not used.
""" """
return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION) return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION)
...@@ -244,7 +244,7 @@ def use_trtllm_attention( ...@@ -244,7 +244,7 @@ def use_trtllm_attention(
has_sinks: bool = False, has_sinks: bool = False,
has_spec: bool = False, has_spec: bool = False,
) -> bool: ) -> bool:
"""Return ``True`` if TRTLLM attention is used.""" """Return `True` if TRTLLM attention is used."""
force_use_trtllm = force_use_trtllm_attention() force_use_trtllm = force_use_trtllm_attention()
# Environment variable is set to 0 - respect it # Environment variable is set to 0 - respect it
......
...@@ -26,17 +26,17 @@ from vllm.v1.kv_cache_interface import ( ...@@ -26,17 +26,17 @@ from vllm.v1.kv_cache_interface import (
from vllm.v1.request import Request from vllm.v1.request import Request
# BlockHash represents the hash of a single KV-cache block used for # BlockHash represents the hash of a single KV-cache block used for
# prefix caching. Treating it as a distinct type from ``bytes`` helps # prefix caching. Treating it as a distinct type from `bytes` helps
# catch accidental misuse when passing around raw byte strings. # catch accidental misuse when passing around raw byte strings.
BlockHash = NewType("BlockHash", bytes) BlockHash = NewType("BlockHash", bytes)
# ``BlockHashWithGroupId`` combines a ``BlockHash`` with its KV cache group ID. # `BlockHashWithGroupId` combines a `BlockHash` with its KV cache group ID.
# It is represented as raw bytes for compactness and efficiency. The helper # It is represented as raw bytes for compactness and efficiency. The helper
# functions below pack/unpack the ``BlockHash`` and group id into/from the key. # functions below pack/unpack the `BlockHash` and group id into/from the key.
BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes) BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
# ExternalBlockHash is used for reproducible prefix-cache block hashing. # ExternalBlockHash is used for reproducible prefix-cache block hashing.
# It's a union of ``bytes`` and ``int`` to keep backward compatibility # It's a union of `bytes` and `int` to keep backward compatibility
# after we default block hashing to use sha256 bytes. # after we default block hashing to use sha256 bytes.
ExternalBlockHash: TypeAlias = bytes | int ExternalBlockHash: TypeAlias = bytes | int
...@@ -44,7 +44,7 @@ ExternalBlockHash: TypeAlias = bytes | int ...@@ -44,7 +44,7 @@ ExternalBlockHash: TypeAlias = bytes | int
def make_block_hash_with_group_id( def make_block_hash_with_group_id(
block_hash: BlockHash, group_id: int block_hash: BlockHash, group_id: int
) -> BlockHashWithGroupId: ) -> BlockHashWithGroupId:
"""Pack a ``BlockHash`` and group id into a ``BlockHashWithGroupId``. """Pack a `BlockHash` and group id into a `BlockHashWithGroupId`.
The group id is encoded using 4 bytes in big-endian order and appended to The group id is encoded using 4 bytes in big-endian order and appended to
the block hash bytes. This representation avoids creating tuples while the block hash bytes. This representation avoids creating tuples while
...@@ -54,12 +54,12 @@ def make_block_hash_with_group_id( ...@@ -54,12 +54,12 @@ def make_block_hash_with_group_id(
def get_block_hash(key: BlockHashWithGroupId) -> BlockHash: def get_block_hash(key: BlockHashWithGroupId) -> BlockHash:
"""Extract the ``BlockHash`` from a ``BlockHashWithGroupId``.""" """Extract the `BlockHash` from a `BlockHashWithGroupId`."""
return BlockHash(key[:-4]) return BlockHash(key[:-4])
def get_group_id(key: BlockHashWithGroupId) -> int: def get_group_id(key: BlockHashWithGroupId) -> int:
"""Extract the group id from a ``BlockHashWithGroupId``.""" """Extract the group id from a `BlockHashWithGroupId`."""
return int.from_bytes(key[-4:], "big", signed=False) return int.from_bytes(key[-4:], "big", signed=False)
......
...@@ -128,7 +128,7 @@ class CPUWorker(Worker): ...@@ -128,7 +128,7 @@ class CPUWorker(Worker):
"Please try to bind threads manually." "Please try to bind threads manually."
) )
# Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`` # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore selected_numa_node = allowed_numa_nodes[self.local_rank] # type: ignore
logical_cpu_list = [ logical_cpu_list = [
x for x in logical_cpu_list if x.numa_node == selected_numa_node x for x in logical_cpu_list if x.numa_node == selected_numa_node
......
...@@ -182,8 +182,8 @@ class TPUWorker: ...@@ -182,8 +182,8 @@ class TPUWorker:
if isinstance(layer_spec, AttentionSpec): if isinstance(layer_spec, AttentionSpec):
dtype = layer_spec.dtype dtype = layer_spec.dtype
# Use an empty tensor instead of `None`` to force Dynamo to pass # Use an empty tensor instead of `None` to force Dynamo to pass
# it by reference, rather by specializing on the value ``None``. # it by reference, rather by specializing on the value `None`.
tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device) tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device)
kv_caches[layer_name] = tpu_kv_cache kv_caches[layer_name] = tpu_kv_cache
else: else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment