Add full API docs and improve the UX of navigating them (#17485)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-04 03:42:43 +01:00
parent 46fae69cf0
commit d6484ef3c3
101 changed files with 872 additions and 980 deletions
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -170,9 +170,10 @@ class Worker(WorkerBase):
        Then, it calculates the free memory that can be used for KV cache in
        bytes.

-        .. tip::
-            You may limit the usage of GPU memory
-            by adjusting the `gpu_memory_utilization` parameter.
+        :::{tip}
+        You may limit the usage of GPU memory
+        by adjusting the `gpu_memory_utilization` parameter.
+        :::
        """
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
 ) -> None:
    """
    Perform sanity checks for the result of
-    :meth:`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
+    {meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
    """
    assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
        "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
@@ -39,7 +39,7 @@ def scatter_mm_placeholders(
    Scatter the multimodal embeddings into a contiguous tensor that represents
    the placeholder tokens.

-    :class:`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
+    {class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.

    Args:
        embeds: The multimodal embeddings.
@@ -66,7 +66,7 @@ def gather_mm_placeholders(
    """
    Reconstructs the embeddings from the placeholder tokens.

-    This is the operation of :func:`scatter_mm_placeholders`.
+    This is the inverse operation of {func}`scatter_mm_placeholders`.
    """
    if is_embed is None:
        return placeholders