Add NeuronxDistributedInference support, Speculative Decoding, Dynamic on-device sampling (#16357)

Signed-off-by: Satyajith Chilappagari <satchill@amazon.com>
Co-authored-by: Aaron Dou <yzdou@amazon.com>
Co-authored-by: Shashwat Srijan <sssrijan@amazon.com>
Co-authored-by: Chongming Ni <chongmni@amazon.com>
Co-authored-by: Amulya Ballakur <amulyaab@amazon.com>
Co-authored-by: Patrick Lange <patlange@amazon.com>
Co-authored-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Lin Lin Pan <tailinpa@amazon.com>
Co-authored-by: Navyadhara Gogineni <navyadha@amazon.com>
Co-authored-by: Yishan McNabb <yishanm@amazon.com>
Co-authored-by: Mrinal Shukla <181322398+mrinalks@users.noreply.github.com>
This commit is contained in the history of this branch.
Authored by Satyajith Chilappagari on 2025-05-07 00:07:30 -07:00; committed by GitHub.
parent ba7703e659
commit 043e4c4955
15 changed files with 1622 additions and 101 deletions

View File

@@ -2273,6 +2273,9 @@ class SpeculativeConfig:
"""Scaling factor for entropy-based threshold, applied when using
`TypicalAcceptanceSampler`."""
speculative_token_tree: Optional[str] = None
"""Specifies the tree structure for speculative token generation.
"""
# required configuration params passed from engine
target_model_config: ModelConfig = field(default=None,
init=True) # type: ignore
@@ -2447,10 +2450,11 @@ class SpeculativeConfig:
"Chunked prefill and EAGLE are not compatible "
"when using V0.")
from vllm.platforms import current_platform
from vllm.transformers_utils.configs.eagle import (
EAGLEConfig)
if isinstance(self.draft_model_config.hf_config,
EAGLEConfig):
EAGLEConfig) or current_platform.is_neuron():
pass
else:
eagle_config = EAGLEConfig(