[Spec Decode] Disable Log Prob serialization to CPU for spec decoding for both draft and target models. (#6485)
This commit is contained in:
@@ -894,6 +894,7 @@ class SpeculativeConfig:
|
||||
draft_token_acceptance_method: str,
|
||||
typical_acceptance_sampler_posterior_threshold: Optional[float],
|
||||
typical_acceptance_sampler_posterior_alpha: Optional[float],
|
||||
disable_logprobs: Optional[bool],
|
||||
) -> Optional["SpeculativeConfig"]:
|
||||
"""Create a SpeculativeConfig if possible, else return None.
|
||||
|
||||
@@ -943,6 +944,11 @@ class SpeculativeConfig:
|
||||
typical_acceptance_sampler_posterior_alpha (Optional[float]):
|
||||
A scaling factor for the entropy-based threshold in the
|
||||
TypicalAcceptanceSampler.
|
||||
disable_logprobs (Optional[bool]): If set to True, token log
|
||||
probabilities are not returned during speculative decoding.
|
||||
If set to False, token log probabilities are returned
|
||||
according to the log probability settings in SamplingParams.
|
||||
If not specified, it defaults to True.
|
||||
|
||||
Returns:
|
||||
Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
|
||||
@@ -1055,6 +1061,8 @@ class SpeculativeConfig:
|
||||
typical_acceptance_sampler_posterior_threshold = 0.09
|
||||
if typical_acceptance_sampler_posterior_alpha is None:
|
||||
typical_acceptance_sampler_posterior_alpha = 0.3
|
||||
if disable_logprobs is None:
|
||||
disable_logprobs = True
|
||||
|
||||
return SpeculativeConfig(
|
||||
draft_model_config,
|
||||
@@ -1068,6 +1076,7 @@ class SpeculativeConfig:
|
||||
typical_acceptance_sampler_posterior_threshold,
|
||||
typical_acceptance_sampler_posterior_alpha=\
|
||||
typical_acceptance_sampler_posterior_alpha,
|
||||
disable_logprobs=disable_logprobs
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -1152,6 +1161,7 @@ class SpeculativeConfig:
|
||||
draft_token_acceptance_method: str,
|
||||
typical_acceptance_sampler_posterior_threshold: float,
|
||||
typical_acceptance_sampler_posterior_alpha: float,
|
||||
disable_logprobs: bool,
|
||||
):
|
||||
"""Create a SpeculativeConfig object.
|
||||
|
||||
@@ -1178,6 +1188,12 @@ class SpeculativeConfig:
|
||||
typical_acceptance_sampler_posterior_alpha (float):
|
||||
A scaling factor for the entropy-based threshold in the
|
||||
TypicalAcceptanceSampler.
|
||||
disable_logprobs (bool): If set to True, token log probabilities will not
|
||||
be returned even if requested by sampling parameters. This
|
||||
reduces latency by skipping logprob calculation in proposal
|
||||
sampling, target sampling, and after accepted tokens are
|
||||
determined. If set to False, log probabilities will be
|
||||
returned.
|
||||
"""
|
||||
self.draft_model_config = draft_model_config
|
||||
self.draft_parallel_config = draft_parallel_config
|
||||
@@ -1191,6 +1207,7 @@ class SpeculativeConfig:
|
||||
typical_acceptance_sampler_posterior_threshold
|
||||
self.typical_acceptance_sampler_posterior_alpha = \
|
||||
typical_acceptance_sampler_posterior_alpha
|
||||
self.disable_logprobs = disable_logprobs
|
||||
|
||||
self._verify_args()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user