[BugFix] Support online dense model DP without overhead (#30739)

Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: njhill <nickhill123@gmail.com>
Author: Nick Hill
Date: 2026-01-02 07:36:38 -08:00
Committed by: GitHub
Parent: 08f425bad1
Commit: bd877162eb
20 changed files with 345 additions and 146 deletions

@@ -642,7 +642,7 @@ class ModelConfig:
         cls = "Transformers"
         # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal
         cls += "MultiModal" if self.hf_config != self.hf_text_config else ""
-        cls += "MoE" if self.get_num_experts() > 1 else ""
+        cls += "MoE" if self.is_moe else ""
         # Check if the architecture we're wrapping has defaults
         runner = None
         task = None
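
For context, here is a minimal sketch (not vLLM's actual code; the `_SimpleConfig` class and `backend_cls_name` helper are invented for illustration) of how the Transformers backend class name is assembled from suffixes, with the MoE suffix now driven by the boolean `is_moe` property rather than a raw expert count:

from dataclasses import dataclass


@dataclass
class _SimpleConfig:
    # Stand-ins for the real config fields referenced in the diff above.
    is_multimodal: bool  # corresponds to `hf_config != hf_text_config`
    is_moe: bool         # corresponds to the new `ModelConfig.is_moe` property


def backend_cls_name(cfg: _SimpleConfig) -> str:
    """Build the Transformers backend class name from suffix flags."""
    cls = "Transformers"
    cls += "MultiModal" if cfg.is_multimodal else ""
    cls += "MoE" if cfg.is_moe else ""
    return cls


# A dense text-only model keeps the plain name; a multimodal MoE model gets both suffixes.
assert backend_cls_name(_SimpleConfig(False, False)) == "Transformers"
assert backend_cls_name(_SimpleConfig(True, True)) == "TransformersMultiModalMoE"
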
@@ -1001,8 +1001,7 @@ class ModelConfig:
             self.enforce_eager = True
 
     def _verify_with_expert_parallelism(self) -> None:
-        num_experts = self.get_num_experts()
-        if num_experts < 1:
+        if not self.is_moe:
             raise ValueError(
                 "Number of experts in the model must be greater than 0 "
                 "when expert parallelism is enabled."
@@ -1797,11 +1796,11 @@ class ModelConfig:
         logger.debug("Generative models support prefix caching.")
         return True
 
-    def is_model_moe(
-        self,
-    ) -> bool:
-        return self.get_num_experts() > 1
+    @property
+    def is_moe(self) -> bool:
+        return self.get_num_experts() > 0
 
     @property
     def is_quantized(self) -> bool:
         return getattr(self.hf_config, "quantization_config", None) is not None
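
The behavioral difference between the removed `is_model_moe()` method and the new `is_moe` property is the threshold: a model reporting exactly one expert now counts as MoE. A sketch of that edge case, using a toy `Cfg` class rather than the real ModelConfig:

class Cfg:
    """Toy config exposing only the expert count."""

    def __init__(self, num_experts: int) -> None:
        self._num_experts = num_experts

    def get_num_experts(self) -> int:
        return self._num_experts

    def is_model_moe(self) -> bool:        # old check (removed above)
        return self.get_num_experts() > 1

    @property
    def is_moe(self) -> bool:              # new check (added above)
        return self.get_num_experts() > 0


single_expert = Cfg(num_experts=1)
print(single_expert.is_model_moe())  # False: old check treated it as dense
print(single_expert.is_moe)          # True:  new check treats it as MoE
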