Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -115,7 +115,7 @@ class LLM:
|
||||
to eager mode. Additionally for encoder-decoder models, if the
|
||||
sequence length of the encoder input is larger than this, we fall
|
||||
back to the eager mode.
|
||||
disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
|
||||
disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig`
|
||||
disable_async_output_proc: Disable async output processing.
|
||||
This may result in lower performance.
|
||||
hf_token: The token to use as HTTP bearer authorization for remote files
|
||||
@@ -127,12 +127,13 @@ class LLM:
|
||||
compilation_config: Either an integer or a dictionary. If it is an
|
||||
integer, it is used as the level of compilation optimization. If it
|
||||
is a dictionary, it can specify the full compilation configuration.
|
||||
**kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
|
||||
:ref:`engine-args`)
|
||||
**kwargs: Arguments for {class}`~vllm.EngineArgs`. (See
|
||||
{ref}`engine-args`)
|
||||
|
||||
Note:
|
||||
This class is intended to be used for offline inference. For online
|
||||
serving, use the :class:`~vllm.AsyncLLMEngine` class instead.
|
||||
:::{note}
|
||||
This class is intended to be used for offline inference. For online
|
||||
serving, use the {class}`~vllm.AsyncLLMEngine` class instead.
|
||||
:::
|
||||
"""
|
||||
|
||||
DEPRECATE_LEGACY: ClassVar[bool] = True
|
||||
@@ -141,7 +142,7 @@ class LLM:
|
||||
DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
|
||||
"""
|
||||
A flag to toggle whether to deprecate positional arguments in
|
||||
:meth:`LLM.__init__`.
|
||||
{meth}`LLM.__init__`.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
@@ -398,7 +399,7 @@ class LLM:
|
||||
|
||||
Args:
|
||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||
for batch inference. See :class:`~vllm.inputs.PromptType`
|
||||
for batch inference. See {class}`~vllm.inputs.PromptType`
|
||||
for more details about the format of each prompt.
|
||||
sampling_params: The sampling parameters for text generation. If
|
||||
None, we use the default sampling parameters.
|
||||
@@ -413,13 +414,14 @@ class LLM:
|
||||
Only applicable when priority scheduling policy is enabled.
|
||||
|
||||
Returns:
|
||||
A list of ``RequestOutput`` objects containing the
|
||||
A list of `RequestOutput` objects containing the
|
||||
generated completions in the same order as the input prompts.
|
||||
|
||||
Note:
|
||||
Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
|
||||
considered legacy and may be deprecated in the future. You should
|
||||
instead pass them via the ``inputs`` parameter.
|
||||
:::{note}
|
||||
Using `prompts` and `prompt_token_ids` as keyword parameters is
|
||||
considered legacy and may be deprecated in the future. You should
|
||||
instead pass them via the `inputs` parameter.
|
||||
:::
|
||||
"""
|
||||
runner_type = self.llm_engine.model_config.runner_type
|
||||
if runner_type not in ["generate", "transcription"]:
|
||||
@@ -488,16 +490,17 @@ class LLM:
|
||||
`self` argument, in addition to the arguments passed in `args`
|
||||
and `kwargs`. The `self` argument will be the worker object.
|
||||
timeout: Maximum time in seconds to wait for execution. Raises a
|
||||
:exc:`TimeoutError` on timeout. `None` means wait indefinitely.
|
||||
{exc}`TimeoutError` on timeout. `None` means wait indefinitely.
|
||||
args: Positional arguments to pass to the worker method.
|
||||
kwargs: Keyword arguments to pass to the worker method.
|
||||
|
||||
Returns:
|
||||
A list containing the results from each worker.
|
||||
|
||||
Note:
|
||||
It is recommended to use this API to only pass control messages,
|
||||
and set up data-plane communication to pass data.
|
||||
|
||||
:::{note}
|
||||
It is recommended to use this API to only pass control messages,
|
||||
and set up data-plane communication to pass data.
|
||||
:::
|
||||
"""
|
||||
|
||||
return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
|
||||
@@ -664,7 +667,7 @@ class LLM:
|
||||
Generate responses for a chat conversation.
|
||||
|
||||
The chat conversation is converted into a text prompt using the
|
||||
tokenizer and calls the :meth:`generate` method to generate the
|
||||
tokenizer and calls the {meth}`generate` method to generate the
|
||||
responses.
|
||||
|
||||
Multi-modal inputs can be passed in the same way you would pass them
|
||||
@@ -903,7 +906,7 @@ class LLM:
|
||||
|
||||
Args:
|
||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||
for batch inference. See :class:`~vllm.inputs.PromptType`
|
||||
for batch inference. See {class}`~vllm.inputs.PromptType`
|
||||
for more details about the format of each prompt.
|
||||
pooling_params: The pooling parameters for pooling. If None, we
|
||||
use the default pooling parameters.
|
||||
@@ -913,13 +916,14 @@ class LLM:
|
||||
generation, if any.
|
||||
|
||||
Returns:
|
||||
A list of ``PoolingRequestOutput`` objects containing the
|
||||
A list of `PoolingRequestOutput` objects containing the
|
||||
pooled hidden states in the same order as the input prompts.
|
||||
|
||||
Note:
|
||||
Using ``prompts`` and ``prompt_token_ids`` as keyword parameters is
|
||||
considered legacy and may be deprecated in the future. You should
|
||||
instead pass them via the ``inputs`` parameter.
|
||||
:::{note}
|
||||
Using `prompts` and `prompt_token_ids` as keyword parameters is
|
||||
considered legacy and may be deprecated in the future. You should
|
||||
instead pass them via the `inputs` parameter.
|
||||
:::
|
||||
"""
|
||||
runner_type = self.llm_engine.model_config.runner_type
|
||||
if runner_type != "pooling":
|
||||
@@ -992,7 +996,7 @@ class LLM:
|
||||
|
||||
Args:
|
||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||
for batch inference. See :class:`~vllm.inputs.PromptType`
|
||||
for batch inference. See {class}`~vllm.inputs.PromptType`
|
||||
for more details about the format of each prompt.
|
||||
pooling_params: The pooling parameters for pooling. If None, we
|
||||
use the default pooling parameters.
|
||||
@@ -1036,7 +1040,7 @@ class LLM:
|
||||
|
||||
Args:
|
||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||
for batch inference. See :class:`~vllm.inputs.PromptType`
|
||||
for batch inference. See {class}`~vllm.inputs.PromptType`
|
||||
for more details about the format of each prompt.
|
||||
use_tqdm: Whether to use tqdm to display the progress bar.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
@@ -1168,7 +1172,7 @@ class LLM:
|
||||
text_1: can be a single prompt or a list of prompts, in which
|
||||
case it has to have the same length as the ``text_2`` list
|
||||
text_2: The texts to pair with the query to form the input
|
||||
to the LLM. See :class:`~vllm.inputs.PromptType` for
|
||||
to the LLM. See {class}`~vllm.inputs.PromptType` for
|
||||
more details about the format of each prompt.
|
||||
use_tqdm: Whether to use tqdm to display the progress bar.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
@@ -1277,7 +1281,7 @@ class LLM:
|
||||
|
||||
def wake_up(self, tags: Optional[list[str]] = None):
|
||||
"""
|
||||
Wake up the engine from sleep mode. See the :meth:`sleep` method
|
||||
Wake up the engine from sleep mode. See the {meth}`sleep` method
|
||||
for more details.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from argparse import Namespace
|
||||
from typing import Annotated, Any, ClassVar, Literal, Optional, Union
|
||||
|
||||
import torch
|
||||
@@ -25,23 +24,7 @@ from vllm.utils import random_uuid, resolve_obj_by_qualname
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
# torch is mocked during docs generation,
|
||||
# so we have to provide the values as literals
|
||||
_MOCK_LONG_INFO = Namespace(min=-9223372036854775808, max=9223372036854775807)
|
||||
_LONG_INFO: Union["torch.iinfo", Namespace]
|
||||
|
||||
try:
|
||||
from sphinx.ext.autodoc.mock import _MockModule
|
||||
|
||||
if isinstance(torch, _MockModule):
|
||||
_LONG_INFO = _MOCK_LONG_INFO
|
||||
else:
|
||||
_LONG_INFO = torch.iinfo(torch.long)
|
||||
except ModuleNotFoundError:
|
||||
_LONG_INFO = torch.iinfo(torch.long)
|
||||
|
||||
assert _LONG_INFO.min == _MOCK_LONG_INFO.min
|
||||
assert _LONG_INFO.max == _MOCK_LONG_INFO.max
|
||||
_LONG_INFO = torch.iinfo(torch.long)
|
||||
|
||||
|
||||
class OpenAIBaseModel(BaseModel):
|
||||
|
||||
@@ -275,7 +275,7 @@ class OpenAIServing:
|
||||
add_special_tokens: bool = True,
|
||||
) -> TextTokensPrompt:
|
||||
"""
|
||||
A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs`
|
||||
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
|
||||
that assumes single input.
|
||||
"""
|
||||
return next(
|
||||
@@ -296,7 +296,7 @@ class OpenAIServing:
|
||||
add_special_tokens: bool = True,
|
||||
) -> Iterator[TextTokensPrompt]:
|
||||
"""
|
||||
A simpler implementation of :meth:`_tokenize_prompt_input_or_inputs`
|
||||
A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
|
||||
that assumes multiple inputs.
|
||||
"""
|
||||
for text in prompt_inputs:
|
||||
|
||||
Reference in New Issue
Block a user