Add NeuronxDistributedInference support, Speculative Decoding, Dynamic on-device sampling (#16357)

Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
Co-authored-by: Aaron Dou <yzdou@amazon.com>
Co-authored-by: Shashwat Srijan <sssrijan@amazon.com>
Co-authored-by: Chongming Ni <chongmni@amazon.com>
Co-authored-by: Amulya Ballakur <amulyaab@amazon.com>
Co-authored-by: Patrick Lange <patlange@amazon.com>
Co-authored-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Lin Lin Pan <tailinpa@amazon.com>
Co-authored-by: Navyadhara Gogineni <navyadha@amazon.com>
Co-authored-by: Yishan McNabb <yishanm@amazon.com>
Co-authored-by: Mrinal Shukla <181322398+mrinalks@users.noreply.github.com>
This commit is contained in the history of this branch.
Authored by Satyajith Chilappagari on 2025-05-07 00:07:30 -07:00; committed by GitHub.
parent ba7703e659
commit 043e4c4955
15 changed files with 1622 additions and 101 deletions

View File

@@ -2273,6 +2273,9 @@ class SpeculativeConfig:
"""Scaling factor for entropy-based threshold, applied when using
`TypicalAcceptanceSampler`."""
speculative_token_tree: Optional[str] = None
"""Specifies the tree structure for speculative token generation.
"""
# required configuration params passed from engine
target_model_config: ModelConfig = field(default=None,
init=True) # type: ignore
@@ -2447,10 +2450,11 @@ class SpeculativeConfig:
"Chunked prefill and EAGLE are not compatible "
"when using V0.")
from vllm.platforms import current_platform
from vllm.transformers_utils.configs.eagle import (
EAGLEConfig)
if isinstance(self.draft_model_config.hf_config,
EAGLEConfig):
EAGLEConfig) or current_platform.is_neuron():
pass
else:
eagle_config = EAGLEConfig(