[Core][Performance] Add XGrammar support for guided decoding and set it as default (#10785)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
This commit is contained in:
Aaron Pham
2024-12-03 02:17:00 -05:00
committed by GitHub
parent 3257d449fa
commit 9323a3153b
11 changed files with 385 additions and 33 deletions

View File

@@ -1,3 +1,4 @@
import copy
import time
from collections import Counter as collectionsCounter
from collections import deque
@@ -1024,9 +1025,9 @@ class LLMEngine:
This function updates num_computed_tokens for prompt sequences
when Multi-Step is enabled.
seq_group: SequenceGroup to update the num_computed_tokens for.
seq_group: SequenceGroup to update the num_computed_tokens for.
seq_group_meta: Metadata of the given SequenceGroup.
is_first_step_output: Optional[bool] -
is_first_step_output: Optional[bool] -
When available, is_first_step_output indicates if the appended
output token is the output of the first-step in multi-step.
A value of None indicates that outputs from all steps in
@@ -2036,7 +2037,11 @@ class LLMEngine:
logits_processors = []
if (guided_decoding := sampling_params.guided_decoding) is not None:
if sampling_params.guided_decoding is not None:
# Defensively shallow-copy sampling params, since guided decoding
# logits processors can hold different state for each request
sampling_params = copy.copy(sampling_params)
guided_decoding = sampling_params.guided_decoding
logger.debug(
"Building guided decoding logits processor in "
@@ -2047,7 +2052,9 @@ class LLMEngine:
self.decoding_config.guided_decoding_backend
processor = get_local_guided_decoding_logits_processor(
guided_params=guided_decoding, tokenizer=tokenizer)
guided_params=guided_decoding,
tokenizer=tokenizer,
model_config=self.model_config)
if processor:
logits_processors.append(processor)