[Core][Performance] Add XGrammar support for guided decoding and set it as default (#10785)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
This commit is contained in:
@@ -1789,15 +1789,15 @@ class PoolerConfig:
|
||||
|
||||
step_tag_id: Optional[int] = None
|
||||
"""
|
||||
If set, only the score corresponding to the ``step_tag_id`` in the
|
||||
If set, only the score corresponding to the ``step_tag_id`` in the
|
||||
generated sentence should be returned. Otherwise, the scores for all tokens
|
||||
are returned.
|
||||
"""
|
||||
|
||||
returned_token_ids: Optional[List[int]] = None
|
||||
"""
|
||||
A list of indices for the vocabulary dimensions to be extracted,
|
||||
such as the token IDs of ``good_token`` and ``bad_token`` in the
|
||||
A list of indices for the vocabulary dimensions to be extracted,
|
||||
such as the token IDs of ``good_token`` and ``bad_token`` in the
|
||||
``math-shepherd-mistral-7b-prm`` model.
|
||||
"""
|
||||
|
||||
@@ -2031,11 +2031,12 @@ def get_served_model_name(model: str,
|
||||
class DecodingConfig:
|
||||
"""Dataclass which contains the decoding strategy of the engine"""
|
||||
|
||||
# Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
|
||||
guided_decoding_backend: str = 'outlines'
|
||||
# Which guided decoding algo to use.
|
||||
# 'outlines' / 'lm-format-enforcer' / 'xgrammar'
|
||||
guided_decoding_backend: str = 'xgrammar'
|
||||
|
||||
def __post_init__(self):
|
||||
valid_guided_backends = ['outlines', 'lm-format-enforcer']
|
||||
valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar']
|
||||
backend = self.guided_decoding_backend
|
||||
if backend not in valid_guided_backends:
|
||||
raise ValueError(f"Invalid guided_decoding_backend '{backend},"
|
||||
@@ -2222,7 +2223,7 @@ class CompilationConfig(BaseModel):
|
||||
from Python, functions can also be passed directly via Python object
|
||||
constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`
|
||||
- custom inductor passes: see PassConfig for more details
|
||||
|
||||
|
||||
Why we have different sizes for cudagraph and inductor:
|
||||
- cudagraph: a cudagraph captured for a specific size can only be used
|
||||
for the same size. We need to capture all the sizes we want to use.
|
||||
|
||||
Reference in New Issue
Block a user