[V0 Deprecation] Remove V0 Spec Decode workers (#21152)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
@@ -2536,8 +2536,6 @@ class DeviceConfig:
|
||||
|
||||
SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
|
||||
"mlp_speculator", "draft_model", "deepseek_mtp"]
|
||||
SpeculativeAcceptanceMethod = Literal["rejection_sampler",
|
||||
"typical_acceptance_sampler"]
|
||||
|
||||
|
||||
@config
|
||||
@@ -2560,13 +2558,6 @@ class SpeculativeConfig:
|
||||
|
||||
If using `ngram` method, the related configuration `prompt_lookup_max` and
|
||||
`prompt_lookup_min` should be considered."""
|
||||
acceptance_method: SpeculativeAcceptanceMethod = "rejection_sampler"
|
||||
"""The method to use for accepting draft tokens:\n
|
||||
- "rejection_sampler" maps to `RejectionSampler`.\n
|
||||
- "typical_acceptance_sampler" maps to `TypicalAcceptanceSampler`.
|
||||
|
||||
If using `typical_acceptance_sampler`, the related configuration
|
||||
`posterior_threshold` and `posterior_alpha` should be considered."""
|
||||
draft_tensor_parallel_size: Optional[int] = None
|
||||
"""The degree of the tensor parallelism for the draft model. Can only be 1
|
||||
or the same as the target model's tensor parallel size."""
|
||||
@@ -2593,9 +2584,6 @@ class SpeculativeConfig:
|
||||
will use the default version."""
|
||||
|
||||
# Advanced control
|
||||
disable_mqa_scorer: bool = False
|
||||
"""Disable the MQA scorer and fall back to batch expansion for scoring
|
||||
proposals."""
|
||||
disable_by_batch_size: Optional[int] = None
|
||||
"""Disable speculative decoding for new incoming requests when the number
|
||||
of enqueued requests is larger than this value, if provided."""
|
||||
@@ -2608,16 +2596,6 @@ class SpeculativeConfig:
|
||||
"""Minimum size of ngram token window when using Ngram proposer, if
|
||||
provided. Defaults to 1."""
|
||||
|
||||
# Typical acceptance sampler configuration
|
||||
posterior_threshold: Optional[float] = None
|
||||
"""A threshold value that sets a lower bound on the posterior probability
|
||||
of a token in the target model for it to be accepted. This threshold is
|
||||
used only when we use the `TypicalAcceptanceSampler` for token acceptance.
|
||||
"""
|
||||
posterior_alpha: Optional[float] = None
|
||||
"""Scaling factor for entropy-based threshold, applied when using
|
||||
`TypicalAcceptanceSampler`."""
|
||||
|
||||
speculative_token_tree: Optional[str] = None
|
||||
"""Specifies the tree structure for speculative token generation.
|
||||
"""
|
||||
@@ -2795,8 +2773,8 @@ class SpeculativeConfig:
|
||||
elif (self.draft_model_config.hf_config.model_type ==
|
||||
"mlp_speculator"):
|
||||
self.method = "mlp_speculator"
|
||||
elif (self.draft_model_config.hf_config.model_type ==
|
||||
"deepseek_mtp"):
|
||||
elif (self.draft_model_config.hf_config.model_type
|
||||
in ("deepseek_mtp", "mimo_mtp")):
|
||||
self.method = "deepseek_mtp"
|
||||
if self.num_speculative_tokens > 1:
|
||||
logger.warning(
|
||||
@@ -2806,6 +2784,11 @@ class SpeculativeConfig:
|
||||
)
|
||||
else:
|
||||
self.method = "draft_model"
|
||||
raise NotImplementedError(
|
||||
"Speculative decoding with draft model is not "
|
||||
"supported yet. Please consider using other "
|
||||
"speculative decoding methods such as ngram, medusa, "
|
||||
"eagle, or deepseek_mtp.")
|
||||
|
||||
# Replace hf_config for EAGLE draft_model
|
||||
if self.method in ("eagle", "eagle3"):
|
||||
@@ -2864,12 +2847,6 @@ class SpeculativeConfig:
|
||||
self.target_parallel_config,
|
||||
self.draft_tensor_parallel_size))
|
||||
|
||||
if self.acceptance_method == "typical_acceptance_sampler":
|
||||
if self.posterior_threshold is None:
|
||||
self.posterior_threshold = 0.09
|
||||
if self.posterior_alpha is None:
|
||||
self.posterior_alpha = 0.3
|
||||
|
||||
@staticmethod
|
||||
def _maybe_override_draft_max_model_len(
|
||||
speculative_max_model_len: Optional[int],
|
||||
@@ -2975,30 +2952,6 @@ class SpeculativeConfig:
|
||||
if self.draft_model_config:
|
||||
self.draft_model_config.verify_with_parallel_config(
|
||||
self.draft_parallel_config)
|
||||
# Validate and set draft token acceptance related settings.
|
||||
|
||||
if self.acceptance_method is None:
|
||||
raise ValueError("acceptance_method is not set. "
|
||||
"Expected values are rejection_sampler or "
|
||||
"typical_acceptance_sampler.")
|
||||
|
||||
if (self.acceptance_method != 'rejection_sampler'
|
||||
and self.acceptance_method != 'typical_acceptance_sampler'):
|
||||
raise ValueError(
|
||||
"Expected acceptance_method to be either "
|
||||
"rejection_sampler or typical_acceptance_sampler. Instead it "
|
||||
f"is {self.acceptance_method}")
|
||||
|
||||
if self.acceptance_method == "typical_acceptance_sampler" and (
|
||||
(self.posterior_threshold is not None
|
||||
and self.posterior_threshold < 0) or
|
||||
(self.posterior_alpha is not None and self.posterior_alpha < 0)):
|
||||
raise ValueError(
|
||||
"Expected the posterior_threshold and posterior_alpha of "
|
||||
"typical_acceptance_sampler to be > 0. "
|
||||
"Instead found posterior_threshold = "
|
||||
f"{self.posterior_threshold} and posterior_alpha = "
|
||||
f"{self.posterior_alpha}")
|
||||
|
||||
if (self.disable_by_batch_size is not None
|
||||
and self.disable_by_batch_size < 2):
|
||||
|
||||
Reference in New Issue
Block a user