[UX] Add vLLM model inspection view (#29450)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -348,6 +348,9 @@ class LLM:
|
||||
self.input_processor = self.llm_engine.input_processor
|
||||
self.io_processor = self.llm_engine.io_processor
|
||||
|
||||
# Cache for __repr__ to avoid repeated collective_rpc calls
|
||||
self._cached_repr: str | None = None
|
||||
|
||||
def get_tokenizer(self) -> TokenizerLike:
    """Return the tokenizer owned by the underlying engine."""
    tokenizer = self.llm_engine.get_tokenizer()
    return tokenizer
|
||||
|
||||
@@ -1786,3 +1789,16 @@ class LLM:
|
||||
# This is necessary because some requests may be finished earlier than
|
||||
# its previous requests.
|
||||
return sorted(outputs, key=lambda x: int(x.request_id))
|
||||
|
||||
def __repr__(self) -> str:
    """Return a transformers-style hierarchical view of the model."""
    # Cache the result to avoid repeated collective_rpc calls.
    cached = self._cached_repr
    if cached is not None:
        return cached

    # In distributed settings we get one result per worker; they should
    # all be identical, so the first one is used.
    worker_views = self.llm_engine.collective_rpc("get_model_inspection")
    if worker_views:
        cached = worker_views[0]
    else:
        cached = f"LLM(model={self.model_config.model!r})"

    self._cached_repr = cached
    return cached
|
||||
|
||||
@@ -250,6 +250,7 @@ if TYPE_CHECKING:
|
||||
VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
|
||||
VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
|
||||
VLLM_USE_V2_MODEL_RUNNER: bool = False
|
||||
VLLM_LOG_MODEL_INSPECTION: bool = False
|
||||
VLLM_DEBUG_MFU_METRICS: bool = False
|
||||
|
||||
|
||||
@@ -1595,6 +1596,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_USE_V2_MODEL_RUNNER": lambda: bool(
|
||||
int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0"))
|
||||
),
|
||||
# Log model inspection after loading.
|
||||
# If enabled, logs a transformers-style hierarchical view of the model
|
||||
# with quantization methods and attention backends.
|
||||
"VLLM_LOG_MODEL_INSPECTION": lambda: bool(
|
||||
int(os.getenv("VLLM_LOG_MODEL_INSPECTION", "0"))
|
||||
),
|
||||
# Debug logging for --enable-mfu-metrics
|
||||
"VLLM_DEBUG_MFU_METRICS": lambda: bool(
|
||||
int(os.getenv("VLLM_DEBUG_MFU_METRICS", "0"))
|
||||
|
||||
@@ -285,5 +285,5 @@ class ApplyRotaryEmb(CustomOp):
|
||||
|
||||
def extra_repr(self) -> str:
    """Return the extra-repr fields shown inside the module's repr.

    Reports the neox-style flag and whether fp32 compute is enabled,
    separated by ", " so the combined repr stays readable.
    """
    s = f"is_neox_style={self.is_neox_style}"
    # Note: exactly one enable_fp32_compute field, with a ", " separator;
    # the un-separated duplicate line was a merge artifact.
    s += f", enable_fp32_compute={self.enable_fp32_compute}"
    return s
|
||||
|
||||
@@ -5,6 +5,7 @@ from abc import ABC, abstractmethod
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.config.load import LoadConfig
|
||||
from vllm.logger import init_logger
|
||||
@@ -50,8 +51,21 @@ class BaseModelLoader(ABC):
|
||||
vllm_config=vllm_config, model_config=model_config
|
||||
)
|
||||
|
||||
log_model_inspection(model)
|
||||
|
||||
logger.debug("Loading weights on %s ...", load_device)
|
||||
# Quantization does not happen in `load_weights` but after it
|
||||
self.load_weights(model, model_config)
|
||||
process_weights_after_loading(model, model_config, target_device)
|
||||
|
||||
return model.eval()
|
||||
|
||||
|
||||
def log_model_inspection(model: nn.Module) -> None:
    """Log model structure if VLLM_LOG_MODEL_INSPECTION=1."""
    if envs.VLLM_LOG_MODEL_INSPECTION:
        # Imported lazily so the formatter is only pulled in when enabled.
        from vllm.model_inspection import format_model_inspection

        logger.info("vLLM model structure:\n%s", format_model_inspection(model))
|
||||
|
||||
vllm/model_inspection.py — new file, 136 lines
@@ -0,0 +1,136 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Model inspection utilities for vLLM."""
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
def _get_module_info(module: nn.Module) -> str:
    """Return a one-line summary string for a module.

    The summary is ``ClassName(quant=..., <extra_repr>)``, where the quant
    entry appears only for quantized modules and the extra_repr part only
    when the module reports a non-empty one.  Objects without an
    ``extra_repr`` attribute fall back to their default repr.
    """
    class_name = type(module).__name__
    parts: list[str] = []

    # Add quant_method if present
    quant_method = getattr(module, "quant_method", None)
    if quant_method is not None:
        quant_name = type(quant_method).__name__
        # For CompressedTensors, show the underlying scheme instead
        scheme = getattr(module, "scheme", None)
        if scheme is not None:
            quant_name = type(scheme).__name__
        # Skip unquantized methods
        if "Unquantized" not in quant_name:
            parts.append(f"quant={quant_name}")

    # If module has extra_repr, use it.  Only append a non-empty result:
    # nn.Module.extra_repr() returns "" by default, and appending "" would
    # leave a dangling separator, e.g. "Cls(quant=X, )".
    has_extra_repr = hasattr(module, "extra_repr")
    if has_extra_repr:
        extra = module.extra_repr().replace("\n", "")
        if extra:
            parts.append(extra)

    # Any nn.Module (all define extra_repr) renders in ClassName(...) form,
    # even when parts is empty — matching the previous output shape.
    if parts or has_extra_repr:
        return f"{class_name}({', '.join(parts)})"

    # For unknown modules, use the default PyTorch repr
    return str(module)
|
||||
|
||||
|
||||
def _get_child_signature(child: nn.Module) -> str:
    """Build a structural fingerprint of a child module.

    Two children with the same signature have identical submodule trees,
    which lets the formatter collapse them into one grouped entry.
    """
    return "\n".join(
        f"{sub_name}:{_get_module_info(sub)}"
        for sub_name, sub in child.named_modules()
    )
|
||||
|
||||
|
||||
def _format_index_ranges(indices: list[int]) -> str:
    """Format indices into range notation (e.g., [0,1,2,4,5,6] -> '0-2, 4-6').

    Consecutive runs collapse to "start-end"; isolated values stay as-is.
    Returns "" for an empty input instead of raising IndexError.
    """
    if not indices:
        return ""

    indices = sorted(indices)
    ranges: list[str] = []
    start = end = indices[0]

    for idx in indices[1:]:
        if idx == end + 1:
            # Extends the current consecutive run.
            end = idx
        else:
            # Run broken: flush the finished range and start a new one.
            ranges.append(str(start) if start == end else f"{start}-{end}")
            start = end = idx

    # Flush the final (possibly single-element) range.
    ranges.append(str(start) if start == end else f"{start}-{end}")
    return ", ".join(ranges)
|
||||
|
||||
|
||||
def _format_module_tree(
    module: nn.Module,
    name: str = "",
    indent: int = 0,
) -> list[str]:
    """Format a module tree with indentation, grouping identical layers.

    Recursively renders *module* and its children, one line per entry,
    collapsing structurally-identical numbered children (per
    ``_get_child_signature``) into a single "N x" group.

    Produces output like:
      (layers): ModuleList(
        (0-27, 29-47): 47 x LlamaDecoderLayer(
          ...
        )
        (28, 48): 2 x DifferentDecoderLayer(
          ...
        )
      )
    """
    lines: list[str] = []
    # One space of indentation per nesting level.
    prefix = " " * indent
    children = list(module.named_children())

    # Leaf node - just output the module info
    if not children:
        info = _get_module_info(module)
        lines.append(f"{prefix}({name}): {info}" if name else f"{prefix}{info}")
        return lines

    # Non-leaf node - output opening line and recurse into children
    info = _get_module_info(module)
    lines.append(f"{prefix}({name}): {info}(" if name else f"{prefix}{info}(")

    # Separate numbered children (e.g., "0", "1") from named ones (e.g., "norm")
    numbered: list[tuple[int, nn.Module]] = []
    non_numbered: list[tuple[str, nn.Module]] = []
    for child_name, child_module in children:
        # EAFP: a child name that parses as an int is a list-style index.
        try:
            numbered.append((int(child_name), child_module))
        except ValueError:
            non_numbered.append((child_name, child_module))

    # Group numbered children by structure signature to collapse identical layers
    # e.g., layers 0-27 and 29-47 with same structure become "(0-27, 29-47): 47 x"
    if numbered:
        sig_to_group: dict[str, list[tuple[int, nn.Module]]] = {}
        for idx, child_module in numbered:
            sig = _get_child_signature(child_module)
            sig_to_group.setdefault(sig, []).append((idx, child_module))

        # Output groups sorted by first index
        for group in sorted(sig_to_group.values(), key=lambda g: g[0][0]):
            indices = [idx for idx, _ in group]
            # Render one representative; all group members are identical.
            representative = group[0][1]
            child_lines = _format_module_tree(representative, "", indent + 1)
            first_line = child_lines[0].lstrip()
            child_prefix = " " * (indent + 1)

            # Rewrite the first rendered line in place to prepend the
            # "(indices): [count x]" label at the child indent level.
            if len(indices) > 1:
                range_str = _format_index_ranges(indices)
                child_lines[0] = (
                    f"{child_prefix}({range_str}): {len(indices)} x {first_line}"
                )
            else:
                child_lines[0] = f"{child_prefix}({indices[0]}): {first_line}"
            lines.extend(child_lines)

    # Output non-numbered children (e.g., "embed_tokens", "norm")
    for child_name, child_module in non_numbered:
        lines.extend(_format_module_tree(child_module, child_name, indent + 1))

    lines.append(f"{prefix})")
    return lines
|
||||
|
||||
|
||||
def format_model_inspection(model: nn.Module) -> str:
    """Render a model as a transformers-style hierarchical string."""
    tree_lines = _format_module_tree(model)
    return "\n".join(tree_lines)
|
||||
@@ -118,6 +118,12 @@ class WorkerBase:
|
||||
"""Apply a function on the model inside this worker."""
|
||||
return fn(self.get_model())
|
||||
|
||||
def get_model_inspection(self) -> str:
    """Return a transformers-style hierarchical view of the model."""
    # Imported lazily to keep the worker module import-light.
    from vllm.model_inspection import format_model_inspection

    model = self.get_model()
    return format_model_inspection(model)
|
||||
|
||||
def load_model(self) -> None:
    """Load model onto target device.

    Abstract-by-convention hook: concrete worker subclasses must override
    this to materialize the model on their device.

    Raises:
        NotImplementedError: always, in this base class.
    """
    raise NotImplementedError
|
||||
|
||||
Reference in New Issue
Block a user