[Docs] Reduce time spent generating API docs (#34255)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -63,8 +63,9 @@ plugins:
|
||||
- git-revision-date-localized:
|
||||
# exclude autogenerated files
|
||||
exclude:
|
||||
- argparse/*
|
||||
- api/*
|
||||
- examples/*
|
||||
- generated/*
|
||||
- minify:
|
||||
minify_html: true
|
||||
minify_js: true
|
||||
@@ -92,7 +93,6 @@ plugins:
|
||||
- "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs
|
||||
summary:
|
||||
modules: true
|
||||
show_if_no_docstring: true
|
||||
show_signature_annotations: true
|
||||
separate_signature: true
|
||||
show_overloads: true
|
||||
|
||||
@@ -1557,6 +1557,7 @@ class ModelConfig:
|
||||
|
||||
@property
|
||||
def attn_type(self) -> AttnTypeStr:
|
||||
"""Determine the attention type based on model configuration."""
|
||||
if self.pooler_config is not None:
|
||||
seq_pooling_type = self._model_info.default_seq_pooling_type
|
||||
if seq_pooling_type == "CLS":
|
||||
|
||||
@@ -4,3 +4,4 @@
|
||||
from vllm.v1.engine.async_llm import AsyncLLM
|
||||
|
||||
AsyncLLMEngine = AsyncLLM # type: ignore
|
||||
"""The `AsyncLLMEngine` class is an alias of [vllm.v1.engine.async_llm.AsyncLLM][]."""
|
||||
|
||||
@@ -4,3 +4,4 @@
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
||||
|
||||
LLMEngine = V1LLMEngine # type: ignore
|
||||
"""The `LLMEngine` class is an alias of [vllm.v1.engine.llm_engine.LLMEngine][]."""
|
||||
|
||||
@@ -298,6 +298,7 @@ which can be passed to
|
||||
|
||||
|
||||
SingletonInputs: TypeAlias = DecoderOnlyInputs | MultiModalEncDecInputs
|
||||
"""The inputs for a single encoder/decoder prompt."""
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -206,6 +206,8 @@ class SGLFusedMOE:
|
||||
|
||||
|
||||
class CPUFusedMOE:
|
||||
"""CPU-based fused MoE implementation."""
|
||||
|
||||
def __init__(self, layer: torch.nn.Module) -> None:
|
||||
use_grouped_gemm, isa = self.check_grouped_gemm(layer)
|
||||
self.isa = isa
|
||||
|
||||
@@ -376,6 +376,8 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
|
||||
|
||||
class CutlassExpertsFp8(CutlassExpertsFp8Base):
|
||||
"""CUTLASS FP8 fused MoE expert implementation."""
|
||||
|
||||
@staticmethod
|
||||
def activation_format() -> mk.FusedMoEActivationFormat:
|
||||
return mk.FusedMoEActivationFormat.Standard
|
||||
@@ -423,6 +425,8 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
|
||||
|
||||
|
||||
class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
|
||||
"""Batched CUTLASS FP8 fused MoE expert implementation."""
|
||||
|
||||
@staticmethod
|
||||
def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
|
||||
# BATCHED activation format works with EP because
|
||||
@@ -651,6 +655,8 @@ def run_cutlass_moe_fp4(
|
||||
|
||||
|
||||
class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
"""CUTLASS FP4 fused MoE expert implementation."""
|
||||
|
||||
@property
|
||||
def expects_unquantized_inputs(self) -> bool:
|
||||
return True
|
||||
|
||||
@@ -113,6 +113,8 @@ def _valid_deep_gemm(
|
||||
|
||||
|
||||
class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
"""DeepGemm-based fused MoE expert implementation."""
|
||||
|
||||
def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig):
|
||||
super().__init__(moe_config=moe_config, quant_config=quant_config)
|
||||
assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout()
|
||||
|
||||
@@ -637,6 +637,8 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
|
||||
|
||||
class MarlinExperts(MarlinExpertsBase):
|
||||
"""Marlin-based fused MoE expert implementation."""
|
||||
|
||||
def supports_expert_map(self) -> bool:
|
||||
return True
|
||||
|
||||
@@ -738,6 +740,8 @@ class MarlinExperts(MarlinExpertsBase):
|
||||
|
||||
|
||||
class BatchedMarlinExperts(MarlinExpertsBase):
|
||||
"""Batched Marlin-based fused MoE expert implementation."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
moe_config: FusedMoEConfig,
|
||||
|
||||
@@ -1527,6 +1527,7 @@ def fused_experts(
|
||||
expert_map: torch.Tensor | None = None,
|
||||
quant_config: FusedMoEQuantConfig | None = None,
|
||||
) -> torch.Tensor:
|
||||
"""Run fused MoE expert computation using Triton kernels."""
|
||||
if quant_config is None:
|
||||
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
|
||||
|
||||
@@ -1879,6 +1880,8 @@ def fused_experts_impl(
|
||||
|
||||
|
||||
class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
"""Triton-based fused MoE expert implementation."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
moe_config: FusedMoEConfig,
|
||||
|
||||
@@ -221,6 +221,7 @@ def triton_kernel_fused_experts(
|
||||
intermediate_cache: torch.Tensor | None = None,
|
||||
a1q_scale: torch.Tensor | None = None,
|
||||
) -> torch.Tensor:
|
||||
"""Triton implementation of fused expert computation using OAI kernels."""
|
||||
if quant_config is None:
|
||||
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
|
||||
|
||||
@@ -444,6 +445,8 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
|
||||
|
||||
class OAITritonExperts(BaseOAITritonExperts):
|
||||
"""OAI Triton-based fused MoE expert implementation."""
|
||||
|
||||
@staticmethod
|
||||
def activation_format() -> mk.FusedMoEActivationFormat:
|
||||
return mk.FusedMoEActivationFormat.Standard
|
||||
|
||||
@@ -63,6 +63,8 @@ def pplx_hidden_dim_scale_bytes(
|
||||
|
||||
|
||||
class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
"""PPLX-based prepare and finalize for expert parallelism."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
a2a: pplx.AllToAll,
|
||||
|
||||
@@ -131,6 +131,8 @@ class MoEPrepareAndFinalizeNaiveEP(mk.FusedMoEPrepareAndFinalize):
|
||||
|
||||
|
||||
class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
|
||||
"""MoE prepare and finalize without expert parallelism."""
|
||||
|
||||
@property
|
||||
def activation_format(self) -> mk.FusedMoEActivationFormat:
|
||||
return mk.FusedMoEActivationFormat.Standard
|
||||
|
||||
@@ -192,6 +192,7 @@ def rocm_aiter_fused_experts(
|
||||
num_local_tokens: torch.Tensor | None = None,
|
||||
output_dtype: torch.dtype | None = None,
|
||||
) -> torch.Tensor:
|
||||
"""ROCm AITER fused MoE expert computation."""
|
||||
if quant_config is None:
|
||||
quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
|
||||
|
||||
|
||||
@@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
|
||||
|
||||
class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
"""TensorRT-LLM-based fused MoE expert implementation."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
moe_config: FusedMoEConfig,
|
||||
|
||||
@@ -680,6 +680,8 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
|
||||
|
||||
|
||||
class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
||||
"""W8A8 FP8 MoE quantization using compressed tensors."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
weight_quant: QuantizationArgs,
|
||||
|
||||
@@ -235,6 +235,8 @@ class Mxfp4Config(QuantizationConfig):
|
||||
|
||||
|
||||
class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
"""MXFP4 MoE quantization method."""
|
||||
|
||||
def __init__(self, moe: FusedMoEConfig):
|
||||
super().__init__(moe)
|
||||
self.weight_dtype = "mxfp4"
|
||||
|
||||
@@ -73,6 +73,7 @@ class Blip2ImageEmbeddingInputs(TensorSchema):
|
||||
|
||||
|
||||
Blip2ImageInputs: TypeAlias = Blip2ImagePixelInputs | Blip2ImageEmbeddingInputs
|
||||
"""Alias for supported BLIP-2 image input types."""
|
||||
|
||||
|
||||
class Blip2QFormerMultiHeadAttention(nn.Module):
|
||||
|
||||
@@ -121,6 +121,7 @@ class LlavaImageEmbeddingInputs(TensorSchema):
|
||||
LlavaImageInputs: TypeAlias = (
|
||||
LlavaImagePixelInputs | PixtralHFImagePixelInputs | LlavaImageEmbeddingInputs
|
||||
)
|
||||
"""Alias for supported LLaVA image input types."""
|
||||
|
||||
|
||||
class LlavaMultiModalProjector(nn.Module):
|
||||
|
||||
@@ -78,6 +78,7 @@ class LlavaNextImageEmbeddingInputs(TensorSchema):
|
||||
LlavaNextImageInputs: TypeAlias = (
|
||||
LlavaNextImagePixelInputs | LlavaNextImageEmbeddingInputs
|
||||
)
|
||||
"""Alias for supported LLaVA-NeXT image input types."""
|
||||
|
||||
|
||||
class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
|
||||
@@ -106,6 +107,7 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
"""Get the number of image tokens for the given image dimensions."""
|
||||
hf_config = self.get_hf_config()
|
||||
vision_encoder_info = self.get_vision_encoder_info()
|
||||
|
||||
|
||||
@@ -1110,6 +1110,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
) -> tuple[Mapping[str, object], Mapping[str, object]]:
|
||||
"""Extract processor and passthrough data from multi-modal items."""
|
||||
processor_data = dict[str, object]()
|
||||
passthrough_data = dict[str, object]()
|
||||
|
||||
@@ -1616,6 +1617,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
token_ids: list[int],
|
||||
mm_prompt_updates: MultiModalPromptUpdates,
|
||||
) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
|
||||
"""Apply multi-modal prompt updates to token IDs."""
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
|
||||
new_token_ids, match_result = self._apply_token_matches(
|
||||
|
||||
@@ -35,6 +35,8 @@ def in_wsl() -> bool:
|
||||
|
||||
|
||||
class PlatformEnum(enum.Enum):
|
||||
"""Enumeration of supported hardware platforms."""
|
||||
|
||||
CUDA = enum.auto()
|
||||
ROCM = enum.auto()
|
||||
TPU = enum.auto()
|
||||
|
||||
@@ -26,6 +26,7 @@ plugins_loaded = False
|
||||
|
||||
|
||||
def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
|
||||
"""Load plugins registered under the given entry point group."""
|
||||
from importlib.metadata import entry_points
|
||||
|
||||
allowed_plugins = envs.VLLM_PLUGINS
|
||||
|
||||
@@ -16,6 +16,8 @@ IOProcessorOutput = TypeVar("IOProcessorOutput")
|
||||
|
||||
|
||||
class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
|
||||
"""Abstract interface for pre/post-processing of engine I/O."""
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
super().__init__()
|
||||
|
||||
|
||||
@@ -69,6 +69,8 @@ class InputStreamError(Exception):
|
||||
|
||||
|
||||
class AsyncLLM(EngineClient):
|
||||
"""An asynchronous wrapper for the vLLM engine."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: VllmConfig,
|
||||
|
||||
Reference in New Issue
Block a user