Signed-off-by: arlenxu <arlenxu@tencent.com> Signed-off-by: xhx1022 <1737006628@qq.com> Co-authored-by: arlenxu <arlenxu@tencent.com>
385 lines
12 KiB
Python
385 lines
12 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""
|
|
End-to-end example for routed experts capture with hybrid models.
|
|
|
|
Validates that:
|
|
1. routed_experts is returned in CompletionOutput for MoE models.
|
|
2. Expert IDs are within valid range.
|
|
3. Results are deterministic across runs (baseline vs reference).
|
|
|
|
Usage:
|
|
python examples/offline_inference/routed_experts_e2e.py \
|
|
--model Qwen/Qwen3-30B-A3B \
|
|
--tp 4 \
|
|
--max-model-len 4096 \
|
|
--num-prompts 20 \
|
|
--max-new-tokens 50
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
|
|
import numpy as np
|
|
|
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
|
|
|
# Module-level logger; configured via logging.basicConfig() in main().
logger = logging.getLogger(__name__)

# Default MoE model used when --model is not given on the command line.
DEFAULT_MODEL = "Qwen/Qwen3-30B-A3B"

# Fixed prompt set for the end-to-end run. --num-prompts selects a prefix of
# this list, so at most len(TEST_PROMPTS) prompts can be exercised.
TEST_PROMPTS = [
    "Hello, my name is",
    "The capital of France is",
    "Explain quantum computing in simple terms:",
    "Write a Python function that sorts a list:",
    "The meaning of life is",
    "In a distant galaxy, there was a",
    "The best way to learn programming is",
    "Once upon a time in a land far away,",
    "The theory of relativity states that",
    "How does photosynthesis work?",
    "Describe the process of machine learning:",
    "What are the benefits of exercise?",
    "The history of artificial intelligence began",
    "Translate the following to French: Hello world",
    "Summarize the plot of Romeo and Juliet:",
    "What is the difference between TCP and UDP?",
    "The water cycle consists of",
    "Explain how a neural network learns:",
    "The periodic table organizes elements by",
    "Write a haiku about the ocean:",
]
|
|
|
|
|
|
@dataclass
class InferenceResult:
    """Result from a single inference run."""

    # One array per prompt: routed expert IDs captured per token, taken from
    # CompletionOutput.routed_experts (at least 2-D; see validate_shapes).
    experts_list: list[np.ndarray] = field(default_factory=list)
    # One list per prompt: the generated token IDs for that prompt.
    token_ids_list: list[list[int]] = field(default_factory=list)
    # Number of routed experts reported by the model config (0 = unknown).
    num_experts: int = 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Inference helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _run_async_inference(
    engine_args: AsyncEngineArgs,
    prompts: list[str],
    max_new_tokens: int,
) -> InferenceResult:
    """Run inference using AsyncLLM.

    Submits all prompts concurrently with greedy sampling, validates each
    prompt's routed_experts payload, and collects results in prompt order.

    Args:
        engine_args: Engine configuration; expected to carry
            enable_return_routed_experts=True (set by run_inference).
        prompts: Prompts to generate from.
        max_new_tokens: Generation cap per prompt.

    Returns:
        InferenceResult with one experts array and one token-id list per
        prompt, in the original prompt order.

    Raises:
        AssertionError: If num_experts cannot be determined from the model
            config, routed_experts is missing, or its length is wrong.
    """
    from vllm.sampling_params import SamplingParams
    from vllm.v1.engine.async_llm import AsyncLLM

    engine = AsyncLLM.from_engine_args(engine_args)
    # Bug fix: shutdown was previously unconditional code after the loop, so
    # any failed assertion leaked the engine. Wrap in try/finally.
    try:
        hf_config = engine.model_config.hf_text_config
        # Qwen-style configs expose "num_experts"; other MoE configs expose
        # "num_local_experts" — accept either.
        num_experts: int = getattr(hf_config, "num_experts", 0) or getattr(
            hf_config, "num_local_experts", 0
        )
        assert num_experts > 0, "Could not determine num_experts from model config"

        # temperature=0 (greedy) so repeated runs are comparable.
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=max_new_tokens,
        )

        async def _generate_one(prompt: str, idx: int):
            request_id = str(uuid.uuid4())
            final_output = None
            # Drain the stream; the last yielded output is the complete one.
            async for output in engine.generate(prompt, sampling_params, request_id):
                final_output = output
            assert final_output is not None

            completion = final_output.outputs[0]
            routed = completion.routed_experts
            num_prompt_tokens = len(final_output.prompt_token_ids)
            num_generated_tokens = len(completion.token_ids)
            # Expected one routing record per processed token; the "- 1"
            # presumably reflects that the final sampled token is never fed
            # back through the model — confirm against vLLM capture semantics.
            expected_len = num_prompt_tokens + num_generated_tokens - 1
            assert routed is not None, f"Prompt {idx}: routed_experts is None"
            assert routed.shape[0] == expected_len, (
                f"Prompt {idx}: routed_experts length {routed.shape[0]} != "
                f"prompt ({num_prompt_tokens}) + generated ({num_generated_tokens})"
                f" - 1 = {expected_len}"
            )
            return idx, routed, list(completion.token_ids)

        tasks = [_generate_one(p, i) for i, p in enumerate(prompts)]
        outputs = await asyncio.gather(*tasks)

        # Sort by original index to maintain prompt order
        outputs.sort(key=lambda x: x[0])

        result = InferenceResult(num_experts=num_experts)
        for _, routed, token_ids in outputs:
            result.experts_list.append(routed)
            result.token_ids_list.append(token_ids)
    finally:
        # Release engine resources even when validation fails.
        engine.shutdown()
    return result
|
|
|
|
|
|
def run_inference(
    model: str,
    prompts: list[str],
    max_new_tokens: int = 50,
    tp: int = 1,
    max_model_len: int = 4096,
) -> InferenceResult:
    """Run inference with routed experts capture enabled via AsyncLLM."""
    async_args = AsyncEngineArgs(
        model=model,
        enable_return_routed_experts=True,
        tensor_parallel_size=tp,
        max_model_len=max_model_len,
        disable_log_stats=True,
        attention_backend="FLASH_ATTN",
    )

    outcome = asyncio.run(_run_async_inference(async_args, prompts, max_new_tokens))

    from vllm.platforms import current_platform

    # Drop cached device memory so a second run (determinism check) fits.
    if current_platform.is_cuda_alike():
        current_platform.empty_cache()

    return outcome
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Validation helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def validate_expert_ids(
    experts_list: list[np.ndarray],
    num_experts: int,
) -> None:
    """Check that all expert IDs are within valid range [0, num_experts)."""
    for idx, ids in enumerate(experts_list):
        non_negative = (ids >= 0).all()
        assert non_negative, (
            f"Prompt {idx}: negative expert IDs found, min={ids.min()}"
        )
        in_range = (ids < num_experts).all()
        assert in_range, (
            f"Prompt {idx}: expert ID out of range [0, {num_experts}), "
            f"max={ids.max()}"
        )
|
|
|
|
|
|
def validate_shapes(experts_list: list[np.ndarray]) -> None:
    """Check that all routed_experts arrays have at least 2 dimensions."""
    for idx, arr in enumerate(experts_list):
        assert arr.ndim >= 2, (
            f"Prompt {idx}: expected at least 2D array, got shape {arr.shape}"
        )
        logger.info("Prompt %d: routed_experts shape = %s", idx, arr.shape)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Comparison helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def compare_token_ids(
    baseline: list[list[int]],
    reference: list[list[int]],
) -> float:
    """Compare token IDs from two runs. Returns mismatch ratio.

    Tokens are compared as a common prefix per prompt: counting stops at the
    first divergence, and every position after it (up to the longer run's
    length) counts as a mismatch.

    Args:
        baseline: Per-prompt token IDs from the first run.
        reference: Per-prompt token IDs from the second run.

    Returns:
        Fraction of mismatched token positions across all prompts.

    Raises:
        AssertionError: If the two runs have different prompt counts.
        ValueError: If there are no tokens to compare.
    """
    assert len(baseline) == len(reference), (
        f"Length mismatch: {len(baseline)} vs {len(reference)}"
    )

    total_tokens = 0
    total_mismatches = 0

    for i, (base, ref) in enumerate(zip(baseline, reference)):
        min_len = min(len(base), len(ref))
        max_len = max(len(base), len(ref))
        # Length of the matching prefix; stop at the first divergence.
        matches = 0
        for a, b in zip(base[:min_len], ref[:min_len]):
            if a != b:
                break
            matches += 1

        total_mismatches += max_len - matches
        total_tokens += max_len

        if matches < min_len or len(base) != len(ref):
            # Bug fix: the denominator was a hard-coded "23,994"; report the
            # actual per-prompt token count instead.
            print(
                f"  Prompt {i}: token_ids len={len(base)} vs {len(ref)}, "
                f"mismatches={max_len - matches}/{max_len}"
            )

    if total_tokens == 0:
        raise ValueError("No tokens to compare")

    mismatch_ratio = total_mismatches / total_tokens
    print(
        f"Token ID mismatches: {total_mismatches}/{total_tokens} ({mismatch_ratio:.4%})"
    )
    return mismatch_ratio
|
|
|
|
|
|
def compare_routed_experts(
    baseline: list[np.ndarray],
    reference: list[np.ndarray],
    threshold: float = 0.05,
) -> float:
    """Compare two runs of routed experts. Returns mismatch ratio.

    Per-token rows are compared by the sum of their expert IDs (an
    order-insensitive fingerprint); counting stops at the first divergence
    and the remainder of the longer run counts as mismatched.

    Args:
        baseline: Per-prompt routed-experts arrays from the first run.
        reference: Per-prompt routed-experts arrays from the second run.
        threshold: Maximum allowed mismatch ratio.

    Returns:
        Fraction of mismatched rows across all prompts.

    Raises:
        AssertionError: If prompt counts differ or the ratio exceeds
            ``threshold``.
        ValueError: If there are no elements to compare.
    """
    assert len(baseline) == len(reference), (
        f"Length mismatch: {len(baseline)} vs {len(reference)}"
    )

    total_elements = 0
    total_mismatches = 0

    for i, (base, ref) in enumerate(zip(baseline, reference)):
        min_len = min(len(base), len(ref))
        max_len = max(len(base), len(ref))
        if min_len == 0:
            # NOTE(review): a prompt where either run produced no rows is
            # skipped entirely and contributes nothing to the ratio.
            continue

        base_trimmed = base[:min_len]
        ref_trimmed = ref[:min_len]

        # Matching-prefix count keyed on the per-row expert-ID sum.
        matches = 0
        for a, b in zip(base_trimmed, ref_trimmed):
            if a.sum() != b.sum():
                break
            matches += 1

        total_mismatches += max_len - matches
        total_elements += max_len

        if matches < min_len or len(base) != len(ref):
            # Bug fix: the denominator was a hard-coded "23,994"; report the
            # actual per-prompt row count instead.
            print(
                f"  Prompt {i}: routed_experts len={len(base)} vs {len(ref)}, "
                f"mismatches={max_len - matches}/{max_len}"
            )

    if total_elements == 0:
        raise ValueError("No elements to compare")

    mismatch_ratio = total_mismatches / total_elements
    print(
        f"Routed experts mismatches: {total_mismatches}/{total_elements} "
        f"({mismatch_ratio:.4%})"
    )

    assert mismatch_ratio < threshold, (
        f"Too many mismatches: {total_mismatches}/{total_elements} "
        f"({mismatch_ratio:.4%}) exceeds threshold {threshold:.4%}"
    )

    return mismatch_ratio
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main():
    """CLI entry point: run capture, validate, optionally check determinism."""
    # Presumably enables batch-invariant kernels for run-to-run
    # reproducibility — confirm against vLLM environment-variable docs.
    os.environ.setdefault("VLLM_BATCH_INVARIANT", "1")

    parser = argparse.ArgumentParser(
        description="Test routed experts capture for MoE models"
    )
    parser.add_argument("--model", type=str, default=DEFAULT_MODEL)
    parser.add_argument("--tp", type=int, default=1)
    parser.add_argument("--max-model-len", type=int, default=4096)
    parser.add_argument("--num-prompts", type=int, default=20)
    parser.add_argument("--max-new-tokens", type=int, default=50)
    parser.add_argument(
        "--deterministic",
        action="store_true",
        help="Run twice and compare results for determinism check",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.05,
        help="Maximum allowed mismatch ratio for determinism check",
    )
    opts = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    selected = TEST_PROMPTS[: opts.num_prompts]

    print(f"Model: {opts.model}")
    print(f"TP: {opts.tp}")
    print(f"Prompts: {len(selected)}")
    print(f"Max new tokens: {opts.max_new_tokens}")
    print()

    print("=== Run 1 (baseline) ===")
    first = run_inference(
        model=opts.model,
        prompts=selected,
        max_new_tokens=opts.max_new_tokens,
        tp=opts.tp,
        max_model_len=opts.max_model_len,
    )
    print(f"num_experts (from model config): {first.num_experts}")

    print("\n=== Validation ===")
    validate_shapes(first.experts_list)
    validate_expert_ids(first.experts_list, num_experts=first.num_experts)
    print(f"All {len(first.experts_list)} results passed validation.")

    for idx, arr in enumerate(first.experts_list):
        print(f"  Prompt {idx}: shape={arr.shape}, min={arr.min()}, max={arr.max()}")

    if opts.deterministic:
        print("\n=== Run 2 (reference) ===")
        second = run_inference(
            model=opts.model,
            prompts=selected,
            max_new_tokens=opts.max_new_tokens,
            tp=opts.tp,
            max_model_len=opts.max_model_len,
        )

        print("\n=== Determinism Check ===")
        validate_expert_ids(second.experts_list, num_experts=first.num_experts)

        print("\n--- Token IDs ---")
        token_mismatch = compare_token_ids(first.token_ids_list, second.token_ids_list)

        print("\n--- Routed Experts ---")
        expert_mismatch = compare_routed_experts(
            first.experts_list,
            second.experts_list,
            threshold=opts.threshold,
        )

        print(
            f"\nDeterminism check passed. "
            f"Token mismatch: {token_mismatch:.4%}, "
            f"Expert mismatch: {expert_mismatch:.4%}"
        )

    print("\nAll tests passed!")
|
|
|
|
|
|
# Standard script guard: only run when executed directly, not on import.
if __name__ == "__main__":
    main()
|