[Feature] [Spec decode]: Enable MLPSpeculator/Medusa and prompt_logprobs with ChunkedPrefill (#10132)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: wallashss <wallashss@ibm.com>
Co-authored-by: wallashss <wallashss@ibm.com>
This commit is contained in:
@@ -1685,7 +1685,8 @@ class SpeculativeConfig:
|
||||
raise ValueError("Expect the batch size threshold of disabling "
|
||||
"speculative decoding is > 1, but got "
|
||||
f"{speculative_disable_by_batch_size=}")
|
||||
|
||||
if (enable_chunked_prefill and speculative_model == "eagle"):
|
||||
raise ValueError("Chunked prefill and EAGLE are not compatible.")
|
||||
# TODO: The user should be able to specify revision/max model len
|
||||
# for the draft model. It is not currently supported.
|
||||
draft_revision = None
|
||||
@@ -1752,12 +1753,6 @@ class SpeculativeConfig:
|
||||
f"num_speculative_tokens={n_predict}, but "
|
||||
f"{num_speculative_tokens=} was provided.")
|
||||
|
||||
if enable_chunked_prefill and draft_hf_config.model_type in (
|
||||
"medusa", "mlp_speculator", "eagle"):
|
||||
raise ValueError(
|
||||
"Chunked prefill and hidden-state based draft models are "
|
||||
"not compatible.")
|
||||
|
||||
speculative_draft_tensor_parallel_size = \
|
||||
SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size(
|
||||
target_parallel_config,
|
||||
|
||||
Reference in New Issue
Block a user