[Docs] Fix warnings in mkdocs build (continued) (#24791)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-09-13 16:13:44 +09:00
parent 5febdc8750
commit 9a8966bcc2
27 changed files with 102 additions and 110 deletions
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -823,7 +823,7 @@ class SupportsEagle3(Protocol):
        
        Args:
            layers: Tuple of layer indices that should output auxiliary
-              hidden states.
+                hidden states.
        """
        ...

--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module):
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
-                otherwise it will be `(seq_len,).
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
+                otherwise it will be `(seq_len,)`.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
        """
        if intermediate_tensors is not None:
            inputs_embeds = None
--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor:
    return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0)


-def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int],
-                                                              torch.Tensor]):
+def get_num_patches(grid_thw: torch.Tensor,
+                    num_frames: Union[list[int], torch.Tensor]) -> list[int]:
    """
    Return num_patches per video.

    Args:
-        t: tensor with shape [N, ...] where each item is a list/tensor
-        cu_seqlens: list indicating the boundaries of groups
+        grid_thw: Tensor with shape [N, 3] containing temporal, height, width
+            dimensions
+        num_frames: List or tensor indicating the number of frames per video

    Returns:
-        list of ints representing the sum of products for each group
+        List of ints representing the number of patches for each video

    Examples:
        >>> # Suppose there are 2 videos with a total of 3 grids
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each input image.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [LlavaImageInputs][]
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each grid patch for each input image.
-            image_sizes: The original `(height, width)` for each input image.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [LlavaNextImageInputs][]
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each input image.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [Mistral3ImagePixelInputs][]
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -387,11 +387,10 @@ class Llama4VisionEncoder(nn.Module):
    ) -> torch.Tensor:
        r"""
        Args:
-            inputs_embeds (`torch.FloatTensor` of shape
-                    `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to
-                directly pass an embedded representation. This is useful if you
-                want more control over how to convert `input_ids` indices into
+            hidden_states: Input tensor of shape
+                (batch_size, sequence_length, hidden_size).
+                Hidden states from the model embeddings, representing
+                the input tokens.
                associated vectors than the model's internal embedding
                lookup matrix.
        """
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -70,11 +70,15 @@ def multihead_attention(
    v: torch.Tensor,
    q_cu_seqlens: Optional[torch.Tensor] = None,
    k_cu_seqlens: Optional[torch.Tensor] = None,
-):
+) -> torch.Tensor:
    """Multi-head attention using flash attention 2.

    Args:
-        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
+        q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
            or (tot_seqlens, num_heads, head_dim) if packing.
        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
            The first element should be 0 and the last element should be q.shape[0].
@@ -123,8 +127,14 @@ def sdpa_attention(
    """SDPA attention.

    Args:
-        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
+        q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
            or (tot_seqlens, num_heads, head_dim) if packing.
+        k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        q_cu_seqlens: Optional cumulative sequence lengths of q.
+        k_cu_seqlens: Optional cumulative sequence lengths of k.
    """
    seq_length = q.shape[0]
    attention_mask = torch.zeros([1, seq_length, seq_length],
@@ -387,7 +397,7 @@ class MLP2(nn.Module):
    def __init__(self,
                 dims: list[int],
                 activation,
-                 bias=True,
+                 bias: bool = True,
                 prefix: str = "",
                 use_data_parallel: bool = False):
        super().__init__()
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
    Typically used as a very first layer in a model.

    Args:
-        input_size: int
-            layer input size.
+        config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig)
+            object containing model parameters.
    """

    def __init__(self, config: Phi4MultimodalAudioConfig):
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
-                otherwise it will be `(seq_len,).
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
+                otherwise it will be `(seq_len,)`.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
        """

        if intermediate_tensors is not None:
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module):
                position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]:
        """
        Args:
-            hidden_states (`torch.FloatTensor`):
-                Input to the layer of shape `(batch, seq_len, embed_dim)`.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all 
-                attention layers. See `attentions` under
-                returned tensors for more detail.
+            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
+            cu_seqlens: Cumulative sequence lengths tensor.
+            position_embeddings: Position embeddings tensor.
        """
        residual = hidden_states

@@ -534,19 +531,11 @@ class Siglip2Encoder(nn.Module):
    ) -> torch.Tensor:
        r"""
        Args:
-            inputs_embeds (`torch.FloatTensor` of shape
-                `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to
-                directly pass an embedded representation. This is useful if
-                you want more control over how to convert `input_ids` indices
-                into associated vectors than the model's internal embedding
-                lookup matrix.
-            grid_thws (`torch.LongTensor`):
-                grid shape (num_patches, 3)
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See
-                `hidden_states` under returned tensors for more detail.
-            return_dict (`bool`, *optional*):
+            inputs_embeds: Input tensor of shape
+                (batch_size, sequence_length, hidden_size).
+                Embedded representation of the input tokens.
+            grid_thws: Grid tensor of shape (num_patches, 3)
+                containing grid dimensions.
                Whether or not to return a [`~utils.ModelOutput`] instead of
                a plain tuple.
        """
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
        with the `input_ids`.

        Args:
-            audio_features: A batch of audio input chunks [B, N, 80, M].
-            audio_lens: Length of audio frames for each audio chunk [B].
-            audio_token_len: Length of audio tokens for each audio chunk [B'].
-                Note: batch dim is different from batch dim in audio chunks.
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.

        """

--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -909,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
            prefix: Optional prefix for parameter names
        
        Raises:
-            AssertionError: If prefix caching is enabled
-            (not supported by Mamba)
+            AssertionError: If prefix caching is enabled
+                (not supported by Mamba)
        """
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config