[Feature] Support recording expert indices for rollout router replay (#28284)

Signed-off-by: xhx1022 <1737006628@qq.com>
Signed-off-by: Hongxin Xu <70438206+xhx1022@users.noreply.github.com>
Signed-off-by: arlenxu <arlenxu@tencent.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: arlenxu <arlenxu@tencent.com>
Author: Hongxin Xu
Date: 2026-01-12 22:23:04 +08:00 (committed by GitHub)
Parent: 0565f1fdec
Commit: 49e6b86c91
11 changed files with 463 additions and 3 deletions


@@ -158,6 +158,7 @@ class LLM:
         enforce_eager: Whether to enforce eager execution. If True, we will
             disable CUDA graph and always execute the model in eager mode.
             If False, we will use CUDA graph and eager execution in hybrid.
+        enable_return_routed_experts: Whether to return routed experts.
         disable_custom_all_reduce: See
             [ParallelConfig][vllm.config.ParallelConfig].
         hf_token: The token to use as HTTP bearer authorization for remote files
@@ -209,6 +210,7 @@ class LLM:
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
         enforce_eager: bool = False,
+        enable_return_routed_experts: bool = False,
         disable_custom_all_reduce: bool = False,
         hf_token: bool | str | None = None,
         hf_overrides: HfOverrides | None = None,
@@ -317,6 +319,7 @@ class LLM:
         swap_space=swap_space,
         cpu_offload_gb=cpu_offload_gb,
         enforce_eager=enforce_eager,
+        enable_return_routed_experts=enable_return_routed_experts,
         disable_custom_all_reduce=disable_custom_all_reduce,
         hf_token=hf_token,
         hf_overrides=hf_overrides,
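Per the PR title, the point of recording routed-expert indices is "router replay": during a rollout, note which experts each token's MoE router selected so the same routing decisions can be reproduced later. The sketch below illustrates that idea only; it is not vLLM's implementation, and `route_tokens` and its arguments are invented for this example. It shows a minimal top-k router that records expert indices on the first pass and replays them on the second:

```python
import math

def route_tokens(hidden, gate_weight, top_k=2, forced_indices=None):
    """Toy MoE router: pick top_k experts per token and record their indices.

    hidden:         list of token vectors, each of length hidden_dim
    gate_weight:    hidden_dim x num_experts router projection (list of rows)
    forced_indices: previously recorded per-token expert indices; when given,
                    the router replays those choices instead of recomputing.
    Returns (indices, probs): per-token expert ids and their routing weights.
    """
    num_experts = len(gate_weight[0])
    all_indices, all_probs = [], []
    for t, vec in enumerate(hidden):
        # Router logits for this token: vec @ gate_weight
        logits = [sum(v * gate_weight[d][e] for d, v in enumerate(vec))
                  for e in range(num_experts)]
        if forced_indices is None:
            # Record: the top_k highest-scoring experts for this token.
            idx = sorted(range(num_experts), key=lambda e: -logits[e])[:top_k]
        else:
            # Replay: reuse the expert indices recorded during rollout.
            idx = forced_indices[t]
        # Softmax over only the chosen experts to get routing weights.
        chosen = [logits[e] for e in idx]
        m = max(chosen)
        exps = [math.exp(c - m) for c in chosen]
        z = sum(exps)
        all_indices.append(idx)
        all_probs.append([e / z for e in exps])
    return all_indices, all_probs

hidden = [[0.1, -0.4, 0.7], [1.2, 0.0, -0.3]]   # 2 tokens, hidden dim 3
gate = [[0.5, -0.2, 0.1, 0.9],
        [0.3, 0.8, -0.5, 0.0],
        [-0.7, 0.2, 0.6, 0.4]]                  # 4 experts
recorded, _ = route_tokens(hidden, gate)                            # rollout: record
replayed, _ = route_tokens(hidden, gate, forced_indices=recorded)   # replay
print(recorded == replayed)  # True: replay reproduces the routing decisions
```

Recording and replaying indices makes the MoE routing path deterministic across runs, which is the property a rollout/replay workflow needs; how the recorded indices are surfaced to the caller in vLLM is not shown in this hunk.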