# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MkDocs hook to automatically convert inline code references to API doc links.

For example, `WeightTransferConfig` becomes
[`WeightTransferConfig`][vllm.config.WeightTransferConfig]

This works with the `autorefs` plugin to create clickable cross-references
to API documentation pages generated by `mkdocstrings`.

The hook builds an index of all documented public Python names (classes and
functions with docstrings) from the vllm package at startup using AST parsing,
then substitutes matching inline code spans on each page. Names without
docstrings are excluded because mkdocstrings will not generate a page for them.
"""

import ast
import logging
from pathlib import Path

import regex as re
from mkdocs.config.defaults import MkDocsConfig
from mkdocs.structure.files import Files
from mkdocs.structure.pages import Page

# Log under the "mkdocs" logger name.
logger = logging.getLogger("mkdocs")

# ROOT_DIR is four directory levels above this file (presumably the repository
# root - confirm if the hook is ever moved); VLLM_DIR is the package to index.
ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve()
VLLM_DIR = ROOT_DIR / "vllm"

# Maps short name -> qualified name (e.g. "ModelConfig" -> "vllm.config.ModelConfig")
# Starts empty; on_startup() rebinds it to the index built by _build_index().
_name_index: dict[str, str] = {}

# Fenced code block pattern (``` or ~~~, with optional language specifier).
# The opening fence must sit at the very start of the text or directly after a
# newline (no leading indentation is accepted). The rest of that line is
# consumed as the optional language specifier, and the lazy body `.*?` (which
# spans lines thanks to re.DOTALL) stops at the first newline that is followed
# by the same fence string. When the block is not at the start of the text,
# the newline preceding the opening fence is part of the match.
_FENCED_BLOCK = re.compile(
    r"(?:^|\n)(?P<fence>`{3,}|~{3,})[^\n]*\n.*?(?:\n(?P=fence))", re.DOTALL
)

# Inline code that is NOT already part of a markdown link.
# Matches `Name` but not [`Name`] and not [`Name`][...] or [`Name`](...).
# The name must start with an uppercase letter, so empty spans, digit-leading
# spans and ordinary lowercase words are never treated as API references.
# A backtick directly before or after the span is rejected as well, so that
# double-backtick code spans such as ``Name`` are left untouched.
_INLINE_CODE = re.compile(
    r"(?<![\[`])"  # not preceded by [ or by another backtick
    r"`(?P<name>[A-Z][A-Za-z0-9_]*)`"  # `UpperCamelCase` or `UPPER_SNAKE`
    r"(?![\]`])"  # not followed by ] or by another backtick
)


def _has_docstring(node: ast.AST) -> bool:
    """Return True when ``node`` is a class or function definition with a docstring.

    Any other kind of AST node yields False.
    """
    documentable = (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)
    if isinstance(node, documentable):
        # clean=False: only presence matters, so skip the indentation cleanup.
        docstring = ast.get_docstring(node, clean=False)
        return docstring is not None
    return False


def _module_path(filepath: Path) -> str:
    """Translate a file path under ``ROOT_DIR`` into its dotted module name.

    The file suffix is dropped, and a trailing ``__init__`` component is removed
    so that a package's ``__init__.py`` maps to the package itself.
    """
    *packages, stem = filepath.relative_to(ROOT_DIR).with_suffix("").parts
    components = packages if stem == "__init__" else [*packages, stem]
    return ".".join(components)


def _index_file(filepath: Path) -> dict[str, str]:
    """Collect the documented public top-level names defined in one Python file.

    The file is parsed with ``ast`` (never imported). A top-level definition is
    kept when its name does not start with an underscore, it has a docstring,
    and it is either a class or a (sync/async) function whose name starts with
    an uppercase letter. Undocumented symbols are dropped because mkdocstrings
    won't generate a page for them.

    Returns:
        Mapping of short name to ``<dotted module>.<name>``; empty when the
        file cannot be decoded as UTF-8 or is not valid Python.
    """
    try:
        tree = ast.parse(
            filepath.read_text(encoding="utf-8"), filename=str(filepath)
        )
    except (SyntaxError, UnicodeDecodeError):
        # Unparseable or undecodable files simply contribute no names.
        return {}

    module = _module_path(filepath)

    def _wanted(node: ast.AST) -> bool:
        """Decide whether a top-level node should be indexed."""
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            # Functions qualify only with an uppercase/CamelCase name.
            if not node.name[0].isupper():
                return False
        elif not isinstance(node, ast.ClassDef):
            return False
        return not node.name.startswith("_") and _has_docstring(node)

    return {
        node.name: f"{module}.{node.name}"
        for node in ast.iter_child_nodes(tree)
        if _wanted(node)
    }


def _build_index() -> dict[str, str]:
    """Walk the vllm package and build a name -> qualified path index.

    Every ``*.py`` file under ``VLLM_DIR`` is visited in sorted order, except:

    * files whose path *inside the package* has a component starting with an
      underscore (``__init__`` modules are exempt), and
    * files below the vendored ``third_party`` / ``vllm_flash_attn`` trees.

    When several modules define the same short name, the one from the
    shallowest file wins; on equal depth the file visited first (sorted order)
    is kept.

    Returns:
        Mapping of short name to fully qualified dotted name.
    """
    index: dict[str, str] = {}
    # Track conflicts: if multiple modules define the same name,
    # prefer shallower modules (more likely to be the public API).
    depth: dict[str, int] = {}

    for filepath in sorted(VLLM_DIR.rglob("*.py")):
        rel = filepath.relative_to(VLLM_DIR)

        # Skip internal/private modules. The check runs on the path relative to
        # the package with the file suffix removed, so that
        #   * "__init__.py" is seen as "__init__" and is really exempted, and
        #   * underscore-prefixed directories *above* the package (e.g. a CI
        #     checkout under "_work/") cannot exclude every file.
        module_parts = rel.with_suffix("").parts
        if any(part.startswith("_") and part != "__init__" for part in module_parts):
            continue
        # Skip third-party vendored code
        if rel.parts and rel.parts[0] in ("third_party", "vllm_flash_attn"):
            continue

        module_depth = len(filepath.relative_to(ROOT_DIR).parts)

        for name, qualified in _index_file(filepath).items():
            if name not in index or module_depth < depth[name]:
                index[name] = qualified
                depth[name] = module_depth

    return index


def on_startup(*, command: str, dirty: bool) -> None:
    """MkDocs ``on_startup`` hook: populate the module-level name index.

    ``command`` and ``dirty`` are accepted to match the hook signature but are
    not used here.
    """
    global _name_index
    fresh_index = _build_index()
    _name_index = fresh_index
    logger.info("autoref_code: indexed %d names from vllm/", len(fresh_index))


def on_page_markdown(
    markdown: str, *, page: Page, config: MkDocsConfig, files: Files
) -> str:
    """MkDocs ``on_page_markdown`` hook: turn inline code names into autoref links.

    The page is returned unchanged when the name index is empty or when the
    page lives under ``api/`` (linking there would be circular/redundant).
    Otherwise fenced code blocks are swapped for placeholders, every inline
    code span whose name is in the index becomes ``[`Name`][qualified.Name]``,
    and the fenced blocks are put back verbatim.
    """
    if not _name_index or page.file.src_path.startswith("api/"):
        return markdown

    stashed_blocks: list[str] = []

    def _stash(found: re.Match) -> str:
        """Remember a fenced block and hand back its numbered placeholder."""
        placeholder = f"\ue000CODEBLOCK{len(stashed_blocks)}\ue000"
        stashed_blocks.append(found.group(0))
        return placeholder

    def _link(found: re.Match) -> str:
        """Rewrite one inline code span if its name is indexed."""
        name = found.group("name")
        target = _name_index.get(name)
        if target is not None:
            logger.debug("autoref_code: linking `%s` to [%s]", name, target)
            return f"[`{name}`][{target}]"
        return found.group(0)

    def _unstash(found: re.Match) -> str:
        """Look up the original fenced block for a placeholder."""
        return stashed_blocks[int(found.group(1))]

    # Hide fenced blocks first so code inside them is never rewritten.
    without_blocks = _FENCED_BLOCK.sub(_stash, markdown)
    linked = _INLINE_CODE.sub(_link, without_blocks)
    return re.sub(r"\ue000CODEBLOCK(\d+)\ue000", _unstash, linked)