Add full API docs and improve the UX of navigating them (#17485)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-05-04 03:42:43 +01:00
parent 46fae69cf0
commit d6484ef3c3
101 changed files with 872 additions and 980 deletions
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -475,7 +475,7 @@ class _AsyncLLMEngine(LLMEngine):
            *,
            inputs: Optional[PromptType] = None,  # DEPRECATED
    ) -> None:
-        """Async version of :meth:`add_request`."""
+        """Async version of {meth}`add_request`."""
        if inputs is not None:
            prompt = inputs
        assert prompt is not None and params is not None
@@ -582,20 +582,20 @@ async def build_guided_decoding_logits_processor_async(


 class AsyncLLMEngine(EngineClient):
-    """An asynchronous wrapper for :class:`LLMEngine`.
+    """An asynchronous wrapper for {class}`LLMEngine`.

-    This class is used to wrap the :class:`LLMEngine` class to make it
+    This class is used to wrap the {class}`LLMEngine` class to make it
    asynchronous. It uses asyncio to create a background loop that keeps
-    processing incoming requests. The :class:`LLMEngine` is kicked by the
+    processing incoming requests. The {class}`LLMEngine` is kicked by the
    generate method when there are requests in the waiting queue. The generate
-    method yields the outputs from the :class:`LLMEngine` to the caller.
+    method yields the outputs from the {class}`LLMEngine` to the caller.

    Args:
        log_requests: Whether to log the requests.
        start_engine_loop: If True, the background task to run the engine
            will be automatically started in the generate call.
-        *args: Arguments for :class:`LLMEngine`.
-        **kwargs: Arguments for :class:`LLMEngine`.
+        *args: Arguments for {class}`LLMEngine`.
+        **kwargs: Arguments for {class}`LLMEngine`.
    """

    _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -985,7 +985,7 @@ class AsyncLLMEngine(EngineClient):
        from the LLMEngine to the caller.

        Args:
-            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
+            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
                for more details about the format of each input.
            sampling_params: The sampling parameters of the request.
            request_id: The unique id of the request.
@@ -1003,7 +1003,7 @@ class AsyncLLMEngine(EngineClient):
        Details:
            - If the engine is not running, start the background loop,
              which iteratively invokes
-              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
+              {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
              to process the waiting requests.
            - Add the request to the engine's `RequestTracker`.
              On the next background loop, this request will be sent to
@@ -1075,7 +1075,7 @@ class AsyncLLMEngine(EngineClient):
        from the LLMEngine to the caller.

        Args:
-            prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType`
+            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
                for more details about the format of each input.
            pooling_params: The pooling parameters of the request.
            request_id: The unique id of the request.
@@ -1089,46 +1089,48 @@ class AsyncLLMEngine(EngineClient):
            for the request.

        Details:
-            - If the engine is not running, start the background loop,
-              which iteratively invokes
-              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
-              to process the waiting requests.
-            - Add the request to the engine's `RequestTracker`.
-              On the next background loop, this request will be sent to
-              the underlying engine.
-              Also, a corresponding `AsyncStream` will be created.
-            - Wait for the request outputs from `AsyncStream` and yield them.
+        - If the engine is not running, start the background loop,
+            which iteratively invokes
+            {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
+            to process the waiting requests.
+        - Add the request to the engine's `RequestTracker`.
+            On the next background loop, this request will be sent to
+            the underlying engine.
+            Also, a corresponding `AsyncStream` will be created.
+        - Wait for the request outputs from `AsyncStream` and yield them.

        Example:
-            >>> # Please refer to entrypoints/api_server.py for
-            >>> # the complete example.
-            >>>
-            >>> # initialize the engine and the example input
-            >>> # note that engine_args here is AsyncEngineArgs instance
-            >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
-            >>> example_input = {
-            >>>     "input": "What is LLM?",
-            >>>     "request_id": 0,
-            >>> }
-            >>>
-            >>> # start the generation
-            >>> results_generator = engine.encode(
-            >>>    example_input["input"],
-            >>>    PoolingParams(),
-            >>>    example_input["request_id"])
-            >>>
-            >>> # get the results
-            >>> final_output = None
-            >>> async for request_output in results_generator:
-            >>>     if await request.is_disconnected():
-            >>>         # Abort the request if the client disconnects.
-            >>>         await engine.abort(request_id)
-            >>>         # Return or raise an error
-            >>>         ...
-            >>>     final_output = request_output
-            >>>
-            >>> # Process and return the final output
-            >>> ...
+        ```python
+        # Please refer to entrypoints/api_server.py for
+        # the complete example.
+
+        # initialize the engine and the example input
+        # note that engine_args here is AsyncEngineArgs instance
+        engine = AsyncLLMEngine.from_engine_args(engine_args)
+        example_input = {
+            "input": "What is LLM?",
+            "request_id": 0,
+        }
+
+        # start the generation
+        results_generator = engine.encode(
+            example_input["input"],
+            PoolingParams(),
+            example_input["request_id"])
+
+        # get the results
+        final_output = None
+        async for request_output in results_generator:
+            if await request.is_disconnected():
+                # Abort the request if the client disconnects.
+                await engine.abort(example_input["request_id"])
+                # Return or raise an error
+                ...
+            final_output = request_output
+
+        # Process and return the final output
+        ...
+        ```
        """
        try:
            async for output in await self.add_request(