[torch.compile] Speed up MOE handling in forward_context (#33184)

Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-01-27 18:17:54 -05:00
parent 3a6d5cbefd
commit d9aa39a3bb
4 changed files with 22 additions and 18 deletions
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -597,6 +597,10 @@ class CompilationConfig:
    Map from layer name to layer objects that need to be accessed outside
    model code, e.g., Attention, FusedMOE when dp_size>1."""

+    static_all_moe_layers: list[str] = field(default_factory=list, init=False)
+    """The names of all the MOE layers in the model. Starts as an empty list
+    and is not an `__init__` argument."""
+
    # Attention ops; used for piecewise cudagraphs
    # Use PyTorch operator format: "namespace::name"
    _attention_ops: ClassVar[list[str]] = [