[V1][Spec Decode] EAGLE-3 Support (#16937)

Signed-off-by: Bryan Lu <yuzhelu@amazon.com>
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Co-authored-by: Bryan Lu <yuzhelu@amazon.com>
2025-04-25 18:43:07 -04:00
parent 70116459c3
commit a0e619e62a
12 changed files with 358 additions and 34 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2339,9 +2339,10 @@ class SpeculativeConfig:
                )

                # Automatically detect the method
-                if self.method == 'eagle':
+                if self.method in ('eagle', 'eagle3'):
                    pass
-                elif "eagle-" in self.draft_model_config.model.lower():
+                elif "eagle-" in self.draft_model_config.model.lower() or \
+                        "eagle3-" in self.draft_model_config.model.lower():
                    self.method = "eagle"
                elif self.draft_model_config.hf_config.model_type == "medusa":
                    self.method = "medusa"
@@ -2352,7 +2353,7 @@ class SpeculativeConfig:
                    self.method = "draft_model"

                # Replace hf_config for EAGLE draft_model
-                if self.method == "eagle":
+                if self.method in ("eagle", "eagle3"):
                    if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
                        raise ValueError(
                            "Chunked prefill and EAGLE are not compatible "
@@ -2549,6 +2550,12 @@ class SpeculativeConfig:
                             "speculative decoding is > 1, but got "
                             f"{self.disable_by_batch_size=}")

+        if self.method == "eagle3" and self.target_model_config and \
+            "llama" not in self.target_model_config.hf_text_config.model_type:
+            raise ValueError(
+                "Eagle3 is only supported for Llama models. "
+                f"Got {self.target_model_config.hf_text_config.model_type=}")
+
    @property
    def num_lookahead_slots(self) -> int:
        """The number of additional slots the scheduler should allocate per