[Spec Decode] Disable Log Prob serialization to CPU for spec decoding for both draft and target models. (#6485)

This commit is contained in:
sroy745
2024-07-20 23:58:58 -07:00
committed by GitHub
parent d7f4178dd9
commit 14f91fe67c
8 changed files with 333 additions and 64 deletions

View File

@@ -894,6 +894,7 @@ class SpeculativeConfig:
draft_token_acceptance_method: str,
typical_acceptance_sampler_posterior_threshold: Optional[float],
typical_acceptance_sampler_posterior_alpha: Optional[float],
disable_logprobs: Optional[bool],
) -> Optional["SpeculativeConfig"]:
"""Create a SpeculativeConfig if possible, else return None.
@@ -943,6 +944,11 @@ class SpeculativeConfig:
typical_acceptance_sampler_posterior_alpha (Optional[float]):
A scaling factor for the entropy-based threshold in the
TypicalAcceptanceSampler.
disable_logprobs (Optional[bool]): If set to True, token log
probabilities are not returned during speculative decoding.
If set to False, token log probabilities are returned
according to the log probability settings in SamplingParams.
If not specified, it defaults to True.
Returns:
Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
@@ -1055,6 +1061,8 @@ class SpeculativeConfig:
typical_acceptance_sampler_posterior_threshold = 0.09
if typical_acceptance_sampler_posterior_alpha is None:
typical_acceptance_sampler_posterior_alpha = 0.3
if disable_logprobs is None:
disable_logprobs = True
return SpeculativeConfig(
draft_model_config,
@@ -1068,6 +1076,7 @@ class SpeculativeConfig:
typical_acceptance_sampler_posterior_threshold,
typical_acceptance_sampler_posterior_alpha=\
typical_acceptance_sampler_posterior_alpha,
disable_logprobs=disable_logprobs
)
@staticmethod
@@ -1152,6 +1161,7 @@ class SpeculativeConfig:
draft_token_acceptance_method: str,
typical_acceptance_sampler_posterior_threshold: float,
typical_acceptance_sampler_posterior_alpha: float,
disable_logprobs: bool,
):
"""Create a SpeculativeConfig object.
@@ -1178,6 +1188,12 @@ class SpeculativeConfig:
typical_acceptance_sampler_posterior_alpha (Optional[float]):
A scaling factor for the entropy-based threshold in the
TypicalAcceptanceSampler.
disable_logprobs: If set to True, token log probabilities will not
be returned even if requested by sampling parameters. This
reduces latency by skipping logprob calculation in proposal
sampling, target sampling, and after accepted tokens are
determined. If set to False, log probabilities will be
returned.
"""
self.draft_model_config = draft_model_config
self.draft_parallel_config = draft_parallel_config
@@ -1191,6 +1207,7 @@ class SpeculativeConfig:
typical_acceptance_sampler_posterior_threshold
self.typical_acceptance_sampler_posterior_alpha = \
typical_acceptance_sampler_posterior_alpha
self.disable_logprobs = disable_logprobs
self._verify_args()