Add full API docs and improve the UX of navigating them (#17485)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
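All of the docstring edits below follow a single pattern: single-line Sphinx reST roles such as :obj:`int` and :class:`~transformers.PretrainedConfig` are rewritten to the brace-style roles {obj}`int` and {class}`~transformers.PretrainedConfig` used by the new Markdown-based API docs. A minimal sketch of the kind of rewrite involved follows; this helper is an illustration and an assumption, not part of the commit, and note that roles broken across a line break (see the untouched `:class:` / `~transformers.EXAONEModel` pair in the Exaone hunks) would not match its single-line pattern.

    # Hypothetical migration helper (an assumption, not part of this commit):
    # rewrite single-line Sphinx roles like :obj:`int` into brace style.
    import re
    from pathlib import Path

    ROLE = re.compile(r":(obj|class|meth|func|attr):(`[^`]+`)")

    def to_brace_style(text: str) -> str:
        # ":obj:`int`" -> "{obj}`int`"; roles split across lines won't match.
        return ROLE.sub(r"{\1}\2", text)

    for py in Path("vllm").rglob("*.py"):
        src = py.read_text()
        dst = to_brace_style(src)
        if dst != src:
            py.write_text(dst)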
@@ -196,8 +196,7 @@ class DbrxConfig(PretrainedConfig):
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabling this will also
-            allow the model to output the auxiliary loss. See [here]() for more details
+            Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss.
         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
             The aux loss factor for the total loss.
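For context on the two Dbrx options in this hunk, a minimal usage sketch follows; it assumes a transformers-compatible DbrxConfig is importable (an assumption, since this diff only shows the docstring):

    # Hedged sketch: enable router-logit output so the MoE auxiliary
    # load-balancing loss can be added to the total loss during training.
    from transformers import DbrxConfig  # import path is an assumption

    config = DbrxConfig(
        output_router_logits=True,   # return router logits and the aux loss
        router_aux_loss_coef=0.001,  # aux-loss weight (docstring default)
    )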
@@ -35,22 +35,22 @@ class ExaoneConfig(PretrainedConfig):
     Instantiating a configuration with the defaults will yield a similar
     configuration to that of the Exaone

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig`
+    Configuration objects inherit from {class}`~transformers.PretrainedConfig`
     and can be used to control the model outputs. Read the documentation from :
     class:`~transformers.PretrainedConfig` for more information.

     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50257):
+        vocab_size ({obj}`int`, `optional`, defaults to 50257):
             Vocabulary size of the GPT Lingvo model. Defines the number of
-            different tokens that can be represented by the :obj:`inputs_ids`
-            passed when calling :class:`~transformers.ExaoneModel`. Vocabulary
+            different tokens that can be represented by the {obj}`inputs_ids`
+            passed when calling {class}`~transformers.ExaoneModel`. Vocabulary
             size of the model.
             Defines the different tokens that can be represented by the
             `inputs_ids` passed to the forward method of :class:
             `~transformers.EXAONEModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+        hidden_size ({obj}`int`, `optional`, defaults to 2048):
             Dimensionality of the encoder layers and the pooler layer.
-        num_layers (:obj:`int`, `optional`, defaults to 24):
+        num_layers ({obj}`int`, `optional`, defaults to 24):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 32):
             Number of attention heads for each attention layer in the
@@ -68,37 +68,37 @@ class ExaoneConfig(PretrainedConfig):
         specified, will default to `num_attention_heads`.
         rotary_pct (`float`, *optional*, defaults to 0.25):
             percentage of hidden dimensions to allocate to rotary embeddings
-        intermediate_size (:obj:`int`, `optional`, defaults to 8192):
+        intermediate_size ({obj}`int`, `optional`, defaults to 8192):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in
             the Transformer encoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`,
-        defaults to :obj:`"gelu_new"`):
+        activation_function ({obj}`str` or {obj}`function`, `optional`,
+        defaults to {obj}`"gelu_new"`):
             The non-linear activation function (function or string) in the
-            encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`,
-            :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            encoder and pooler. If string, {obj}`"gelu"`, {obj}`"relu"`,
+            {obj}`"selu"` and {obj}`"gelu_new"` are supported.
+        embed_dropout ({obj}`float`, `optional`, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the
             embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout ({obj}`float`, `optional`, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+        max_position_embeddings ({obj}`int`, `optional`, defaults to 2048):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case
             (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling
-            :class:`~transformers.EXAONEModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size ({obj}`int`, `optional`, defaults to 2):
+            The vocabulary size of the {obj}`token_type_ids` passed when calling
+            {class}`~transformers.EXAONEModel`.
+        initializer_range ({obj}`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for
             initializing all weight matrices.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon ({obj}`float`, `optional`, defaults to 1e-5):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache ({obj}`bool`, `optional`, defaults to {obj}`True`):
             Whether or not the model should return the last key/values
             attentions (not used by all models).
             Only relevant if ``config.is_decoder=True``.
-        gradient_checkpointing (:obj:`bool`, `optional`,
-        defaults to :obj:`False`):
+        gradient_checkpointing ({obj}`bool`, `optional`,
+        defaults to {obj}`False`):
             If True, use gradient checkpointing to save memory at the expense
             of slower backward pass.
     Example::
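The Example:: section opened above would show basic instantiation. A minimal sketch under the assumption that the vendored class is exported as vllm.transformers_utils.configs.ExaoneConfig (the import path and exported name are assumptions):

    # Hedged sketch of the usage the Example:: block documents.
    from vllm.transformers_utils.configs import ExaoneConfig  # path assumed

    # Values mirror the docstring defaults above; override per model variant.
    config = ExaoneConfig(vocab_size=50257, hidden_size=2048, num_layers=24)
    print(config.use_cache)  # True by default, per the docstring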