[Feature] [Spec decode]: Enable MLPSpeculator/Medusa and prompt_logprobs with ChunkedPrefill (#10132)

Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: wallashss <wallashss@ibm.com>
Co-authored-by: wallashss <wallashss@ibm.com>
Nicolò Lucchesi authored on 2025-01-27 22:38:35 +01:00 (committed by GitHub)
parent 2bc3fbba0c
commit 6116ca8cd7
16 changed files with 468 additions and 165 deletions
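
With this change, a hidden-state draft model (MLPSpeculator or Medusa) no longer forces chunked prefill off, and prompt_logprobs can be requested alongside speculative decoding. A rough usage sketch of the newly allowed combination, assuming the offline LLM entry point of this vLLM version; the model names are placeholders and not part of this commit:

from vllm import LLM, SamplingParams

# Placeholder target/draft pair; any model with a matching MLPSpeculator or
# Medusa draft head is the intended use case. num_speculative_tokens is
# omitted on the assumption that the draft config's n_predict supplies it.
llm = LLM(
    model="ibm-granite/granite-3b-code-instruct",
    speculative_model="ibm-granite/granite-3b-code-instruct-accelerator",
    enable_chunked_prefill=True,  # previously rejected together with these drafts
)

# prompt_logprobs with speculative decoding is the other combination enabled here.
params = SamplingParams(max_tokens=64, prompt_logprobs=1)
outputs = llm.generate(["Speculative decoding lets a draft model propose tokens."],
                       params)
print(outputs[0].outputs[0].text)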


@@ -1685,7 +1685,8 @@ class SpeculativeConfig:
                 raise ValueError("Expect the batch size threshold of disabling "
                                  "speculative decoding is > 1, but got "
                                  f"{speculative_disable_by_batch_size=}")
-
+            if (enable_chunked_prefill and speculative_model == "eagle"):
+                raise ValueError("Chunked prefill and EAGLE are not compatible.")
             # TODO: The user should be able to specify revision/max model len
             # for the draft model. It is not currently supported.
             draft_revision = None
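
The first hunk narrows the old blanket restriction to EAGLE only: if chunked prefill is on and the speculative model is EAGLE, engine configuration still fails. A hypothetical reproduction of that guard, assuming EngineArgs and its create_engine_config() entry point; the target model name is a placeholder:

from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder target model
    speculative_model="eagle",                 # the pairing rejected by the new check
    enable_chunked_prefill=True,
)
# Expected, per the check added above:
#   ValueError: Chunked prefill and EAGLE are not compatible.
args.create_engine_config()
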
@@ -1752,12 +1753,6 @@ class SpeculativeConfig:
                         f"num_speculative_tokens={n_predict}, but "
                         f"{num_speculative_tokens=} was provided.")
-            if enable_chunked_prefill and draft_hf_config.model_type in (
-                    "medusa", "mlp_speculator", "eagle"):
-                raise ValueError(
-                    "Chunked prefill and hidden-state based draft models are "
-                    "not compatible.")
-
             speculative_draft_tensor_parallel_size = \
                 SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size(
                     target_parallel_config,
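
Taken together, the two hunks narrow the compatibility policy: Medusa and MLPSpeculator draft models are no longer rejected when chunked prefill is enabled, while the EAGLE pairing remains blocked. A minimal distilled sketch of the resulting check, not the actual SpeculativeConfig code; names mirror the diff:

def check_chunked_prefill_compat(enable_chunked_prefill: bool,
                                 speculative_model: str) -> None:
    """Illustrative post-change policy combining the two hunks above."""
    # Only the EAGLE pairing is still rejected after this commit.
    if enable_chunked_prefill and speculative_model == "eagle":
        raise ValueError("Chunked prefill and EAGLE are not compatible.")
    # The blanket rejection of "medusa"/"mlp_speculator"/"eagle" draft model
    # types with chunked prefill (second hunk) has been removed.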