diff --git a/mkdocs.yaml b/mkdocs.yaml index d5d6852f3..ecc0ab692 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -63,8 +63,9 @@ plugins: - git-revision-date-localized: # exclude autogenerated files exclude: - - argparse/* + - api/* - examples/* + - generated/* - minify: minify_html: true minify_js: true @@ -92,7 +93,6 @@ plugins: - "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs summary: modules: true - show_if_no_docstring: true show_signature_annotations: true separate_signature: true show_overloads: true diff --git a/vllm/config/model.py b/vllm/config/model.py index 749af0d5d..5fd7d2d73 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1557,6 +1557,7 @@ class ModelConfig: @property def attn_type(self) -> AttnTypeStr: + """Determine the attention type based on model configuration.""" if self.pooler_config is not None: seq_pooling_type = self._model_info.default_seq_pooling_type if seq_pooling_type == "CLS": diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index ede027759..fc1cea023 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -4,3 +4,4 @@ from vllm.v1.engine.async_llm import AsyncLLM AsyncLLMEngine = AsyncLLM # type: ignore +"""The `AsyncLLMEngine` class is an alias of [vllm.v1.engine.async_llm.AsyncLLM][].""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a0fe38eb3..419139c4b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -4,3 +4,4 @@ from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine LLMEngine = V1LLMEngine # type: ignore +"""The `LLMEngine` class is an alias of [vllm.v1.engine.llm_engine.LLMEngine][].""" diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 7848c2c03..157ab337e 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -298,6 +298,7 @@ which can be passed to SingletonInputs: TypeAlias = DecoderOnlyInputs | MultiModalEncDecInputs +"""The inputs for a single encoder/decoder prompt.""" @dataclass diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index ee4798d84..e929074d5 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -206,6 +206,8 @@ class SGLFusedMOE: class CPUFusedMOE: + """CPU-based fused MoE implementation.""" + def __init__(self, layer: torch.nn.Module) -> None: use_grouped_gemm, isa = self.check_grouped_gemm(layer) self.isa = isa diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index ac5a86067..77d439d32 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -376,6 +376,8 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): class CutlassExpertsFp8(CutlassExpertsFp8Base): + """CUTLASS FP8 fused MoE expert implementation.""" + @staticmethod def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard @@ -423,6 +425,8 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): + """Batched CUTLASS FP8 fused MoE expert implementation.""" + @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: # BATCHED activation format works with EP because @@ -651,6 +655,8 @@ def run_cutlass_moe_fp4( class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): + """CUTLASS FP4 fused MoE expert implementation.""" + @property def expects_unquantized_inputs(self) -> bool: return True diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 00d55bfb7..59dde3ca9 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -113,6 +113,8 @@ def _valid_deep_gemm( class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): + """DeepGemm-based fused MoE expert implementation.""" + def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig): super().__init__(moe_config=moe_config, quant_config=quant_config) assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout() diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 5d382cfc9..3d3a21f81 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -637,6 +637,8 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): class MarlinExperts(MarlinExpertsBase): + """Marlin-based fused MoE expert implementation.""" + def supports_expert_map(self) -> bool: return True @@ -738,6 +740,8 @@ class MarlinExperts(MarlinExpertsBase): class BatchedMarlinExperts(MarlinExpertsBase): + """Batched Marlin-based fused MoE expert implementation.""" + def __init__( self, moe_config: FusedMoEConfig, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6ca3213fb..352288e17 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1527,6 +1527,7 @@ def fused_experts( expert_map: torch.Tensor | None = None, quant_config: FusedMoEQuantConfig | None = None, ) -> torch.Tensor: + """Run fused MoE expert computation using Triton kernels.""" if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG @@ -1879,6 +1880,8 @@ def fused_experts_impl( class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + """Triton-based fused MoE expert implementation.""" + def __init__( self, moe_config: FusedMoEConfig, diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index eafdf97a9..5aaf2a8c3 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -221,6 +221,7 @@ def triton_kernel_fused_experts( intermediate_cache: torch.Tensor | None = None, a1q_scale: torch.Tensor | None = None, ) -> torch.Tensor: + """Triton implementation of fused expert computation using OAI kernels.""" if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG @@ -444,6 +445,8 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): class OAITritonExperts(BaseOAITritonExperts): + """OAI Triton-based fused MoE expert implementation.""" + @staticmethod def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 78b941498..289ac0d14 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -63,6 +63,8 @@ def pplx_hidden_dim_scale_bytes( class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """PPLX-based prepare and finalize for expert parallelism.""" + def __init__( self, a2a: pplx.AllToAll, diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index d10476702..7b8dd3b77 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -131,6 +131,8 @@ class MoEPrepareAndFinalizeNaiveEP(mk.FusedMoEPrepareAndFinalize): class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): + """MoE prepare and finalize without expert parallelism.""" + @property def activation_format(self) -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 33150da6f..535abc420 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -192,6 +192,7 @@ def rocm_aiter_fused_experts( num_local_tokens: torch.Tensor | None = None, output_dtype: torch.dtype | None = None, ) -> torch.Tensor: + """ROCm AITER fused MoE expert computation.""" if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index aa7185040..074b8154a 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): + """TensorRT-LLM-based fused MoE expert implementation.""" + def __init__( self, moe_config: FusedMoEConfig, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 023cf3f67..690ff0454 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -680,6 +680,8 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): + """W8A8 FP8 MoE quantization using compressed tensors.""" + def __init__( self, weight_quant: QuantizationArgs, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 75501076a..5cd6d5d79 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -235,6 +235,8 @@ class Mxfp4Config(QuantizationConfig): class Mxfp4MoEMethod(FusedMoEMethodBase): + """MXFP4 MoE quantization method.""" + def __init__(self, moe: FusedMoEConfig): super().__init__(moe) self.weight_dtype = "mxfp4" diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 0441996f6..f812eb849 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -73,6 +73,7 @@ class Blip2ImageEmbeddingInputs(TensorSchema): Blip2ImageInputs: TypeAlias = Blip2ImagePixelInputs | Blip2ImageEmbeddingInputs +"""Alias for supported BLIP-2 image input types.""" class Blip2QFormerMultiHeadAttention(nn.Module): diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 2f9aaa3f3..c35728183 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -121,6 +121,7 @@ class LlavaImageEmbeddingInputs(TensorSchema): LlavaImageInputs: TypeAlias = ( LlavaImagePixelInputs | PixtralHFImagePixelInputs | LlavaImageEmbeddingInputs ) +"""Alias for supported LLaVA image input types.""" class LlavaMultiModalProjector(nn.Module): diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 9f83c7910..4ea58ce71 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -78,6 +78,7 @@ class LlavaNextImageEmbeddingInputs(TensorSchema): LlavaNextImageInputs: TypeAlias = ( LlavaNextImagePixelInputs | LlavaNextImageEmbeddingInputs ) +"""Alias for supported LLaVA-NeXT image input types.""" class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): @@ -106,6 +107,7 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): image_width: int, image_height: int, ) -> int: + """Get the number of image tokens for the given image dimensions.""" hf_config = self.get_hf_config() vision_encoder_info = self.get_vision_encoder_info() diff --git a/vllm/multimodal/processing/processor.py b/vllm/multimodal/processing/processor.py index 5f98cce3d..e1a164d4e 100644 --- a/vllm/multimodal/processing/processor.py +++ b/vllm/multimodal/processing/processor.py @@ -1110,6 +1110,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): self, mm_items: MultiModalDataItems, ) -> tuple[Mapping[str, object], Mapping[str, object]]: + """Extract processor and passthrough data from multi-modal items.""" processor_data = dict[str, object]() passthrough_data = dict[str, object]() @@ -1616,6 +1617,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): token_ids: list[int], mm_prompt_updates: MultiModalPromptUpdates, ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]: + """Apply multi-modal prompt updates to token IDs.""" tokenizer = self.info.get_tokenizer() new_token_ids, match_result = self._apply_token_matches( diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 45dde6e47..27f5ea517 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -35,6 +35,8 @@ def in_wsl() -> bool: class PlatformEnum(enum.Enum): + """Enumeration of supported hardware platforms.""" + CUDA = enum.auto() ROCM = enum.auto() TPU = enum.auto() diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 4c59d5364..89fadad7a 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -26,6 +26,7 @@ plugins_loaded = False def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]: + """Load plugins registered under the given entry point group.""" from importlib.metadata import entry_points allowed_plugins = envs.VLLM_PLUGINS diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py index a978b1e74..fa71b4ca0 100644 --- a/vllm/plugins/io_processors/interface.py +++ b/vllm/plugins/io_processors/interface.py @@ -16,6 +16,8 @@ IOProcessorOutput = TypeVar("IOProcessorOutput") class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): + """Abstract interface for pre/post-processing of engine I/O.""" + def __init__(self, vllm_config: VllmConfig): super().__init__() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index bb4fffb69..072d2a164 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -69,6 +69,8 @@ class InputStreamError(Exception): class AsyncLLM(EngineClient): + """An asynchronous wrapper for the vLLM engine.""" + def __init__( self, vllm_config: VllmConfig,