Migrate docs from Sphinx to MkDocs (#18145)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

Migrate docs from Sphinx to MkDocs (#18145)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
a1fe24d9 · Harry Mellor · GitHub · d0bc2f81 · a1fe24d9 · a1fe24d9
Unverified Commit a1fe24d9 authored May 23, 2025 by Harry Mellor Committed by GitHub May 23, 2025
14 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -175,7 +175,7 @@ def get_vllm_port() -> Optional[int]:
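-# The begin-* and end* here are used by the documentation generator
-# to extract the used env vars.
+# The snippet `start`/`end` section markers here are used by the
+# documentation generator to extract the used env vars.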

-# begin-env-vars-definition
+# --8<-- [start:env-vars-definition]

 environment_variables: dict[str, Callable[[], Any]] = {

@@ -813,7 +813,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
 }

-# end-env-vars-definition
+# --8<-- [end:env-vars-definition]


 def __getattr__(name: str):

--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -528,12 +528,12 @@ class RayDistributedExecutor(DistributedExecutorBase):
        ray.get(parallel_worker_tasks)

    def _check_ray_cgraph_installation(self):
-        import pkg_resources
+        import importlib.metadata
+
        from packaging import version

        required_version = version.parse("2.43.0")
-        current_version = version.parse(
-            pkg_resources.get_distribution("ray").version)
+        current_version = version.parse(importlib.metadata.version("ray"))
        if current_version < required_version:
            raise ValueError(f"Ray version {required_version} is "
                             f"required, but found {current_version}")

--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -681,9 +681,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
                batch.
            pixel_values: The pixels in each input image.
        
-        :::{seealso}
-        {class}`Blip2ImageInputs`
-        :::
+        Info:
+            [Blip2ImageInputs][]
        """

        if intermediate_tensors is not None:

--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -721,9 +721,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
                batch.
            pixel_values: The pixels in each input image.

-        :::{seealso}
-        {class}`LlavaImageInputs`
-        :::
+        Info:
+            [LlavaImageInputs][]
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -551,9 +551,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
            pixel_values: The pixels in each grid patch for each input image.
            image_sizes: The original `(height, width)` for each input image.

-        :::{seealso}
-        {class}`LlavaNextImageInputs`
-        :::
+        Info:
+            [LlavaNextImageInputs][]
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -559,9 +559,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
                batch.
            pixel_values: The pixels in each input image.

-        :::{seealso}
-        {class}`Mistral3ImagePixelInputs`
-        :::
+        Info:
+            [Mistral3ImagePixelInputs][]
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -11,9 +11,8 @@ MULTIMODAL_REGISTRY = MultiModalRegistry()
-The global {class}`~MultiModalRegistry` is used by model runners to
+The global [MultiModalRegistry][] is used by model runners to
 dispatch data processing according to the target model.

-:::{seealso}
-{ref}`mm-processing`
-:::
+Info:
+    [mm-processing][]
 """

 __all__ = [

--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -289,9 +289,8 @@ class BaseMultiModalField(ABC):
 @dataclass(frozen=True)
 class MultiModalBatchedField(BaseMultiModalField):
    """
-    :::{seealso}
-    {func}`MultiModalFieldConfig.batched`
-    :::
+    Info:
+        [MultiModalFieldConfig.batched][]
    """

    def build_elems(
@@ -320,10 +319,9 @@ class MultiModalBatchedField(BaseMultiModalField):
 @dataclass(frozen=True)
 class MultiModalFlatField(BaseMultiModalField):
    """
-    :::{seealso}
-    {func}`MultiModalFieldConfig.flat`
-    {func}`MultiModalFieldConfig.flat_from_sizes`
-    :::
+    Info:
+        [MultiModalFieldConfig.flat][]
+        [MultiModalFieldConfig.flat_from_sizes][]
    """
    slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
    dim: int = 0
@@ -363,9 +361,8 @@ class MultiModalFlatField(BaseMultiModalField):
 @dataclass(frozen=True)
 class MultiModalSharedField(BaseMultiModalField):
    """
-    :::{seealso}
-    {func}`MultiModalFieldConfig.shared`
-    :::
+    Info:
+        [MultiModalFieldConfig.shared][]
    """
    batch_size: int

@@ -510,9 +507,8 @@ class MultiModalFieldConfig:
            Element 3: [[C],[C]]
        ```

-        :::{seealso}
-        {func}`MultiModalFieldConfig.flat`
-        :::
+        Info:
+            [MultiModalFieldConfig.flat][]
        """

        if size_per_item.ndim != 1:

--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -214,9 +214,8 @@ class MultiModalRegistry:
        When the model receives multi-modal data, the provided function is
        invoked to transform the data into a dictionary of model inputs.

-        :::{seealso}
-        {ref}`mm-processing`
-        :::
+        Info:
+            [mm-processing][]
        """

        def wrapper(model_cls: N) -> N:
@@ -260,9 +259,8 @@ class MultiModalRegistry:
        """
        Create a multi-modal processor for a specific model and tokenizer.

-        :::{seealso}
-        {ref}`mm-processing`
-        :::
+        Info:
+            [mm-processing][]
        """
        if not model_config.is_multimodal_model:
            raise ValueError(f"{model_config.model} is not a multimodal model")

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1926,9 +1926,8 @@ class _PlaceholderBase:
    We need to explicitly override each dunder method because
-    {meth}`__getattr__` is not called when they are accessed.
+    `__getattr__` is not called when they are accessed.

-    :::{seealso}
-    [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
-    :::
+    Info:
+        [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
    """

    def __getattr__(self, key: str) -> Never:

--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -172,10 +172,9 @@ class Worker(WorkerBase):
-        Then, it calculate the free memory that can be used for KV cache in
+        Then, it calculates the free memory that can be used for KV cache in
        bytes.

-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
        """
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -201,10 +201,9 @@ class HPUWorker(LocalOrDistributedWorkerBase):
-        Then, it calculate the maximum possible number of GPU and CPU blocks
+        Then, it calculates the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.

-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -234,10 +234,9 @@ class Worker(LocalOrDistributedWorkerBase):
-        Then, it calculate the maximum possible number of GPU and CPU blocks
+        Then, it calculates the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.

-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.

--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -93,10 +93,9 @@ class XPUWorker(LoRANotSupportedWorkerBase, Worker):
-        Then, it calculate the maximum possible number of GPU and CPU blocks
+        Then, it calculates the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory.

-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.