[Core][Performance] Add XGrammar support for guided decoding and set it as default (#10785)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Signed-off-by: mgoin <michael@neuralmagic.com> Co-authored-by: mgoin <michael@neuralmagic.com>
2024-12-03 02:17:00 -05:00
parent 3257d449fa
commit 9323a3153b
11 changed files with 385 additions and 33 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1789,15 +1789,15 @@ class PoolerConfig:

    step_tag_id: Optional[int] = None
    """
-    If set, only the score corresponding to the ``step_tag_id`` in the 
+    If set, only the score corresponding to the ``step_tag_id`` in the
    generated sentence should be returned. Otherwise, the scores for all tokens
    are returned.
    """

    returned_token_ids: Optional[List[int]] = None
    """
-    A list of indices for the vocabulary dimensions to be extracted, 
-    such as the token IDs of ``good_token`` and ``bad_token`` in the 
+    A list of indices for the vocabulary dimensions to be extracted,
+    such as the token IDs of ``good_token`` and ``bad_token`` in the
    ``math-shepherd-mistral-7b-prm`` model.
    """

@@ -2031,11 +2031,12 @@ def get_served_model_name(model: str,
 class DecodingConfig:
    """Dataclass which contains the decoding strategy of the engine"""

-    # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
-    guided_decoding_backend: str = 'outlines'
+    # Which guided decoding algo to use.
+    # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
+    guided_decoding_backend: str = 'xgrammar'

    def __post_init__(self):
-        valid_guided_backends = ['outlines', 'lm-format-enforcer']
+        valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar']
        backend = self.guided_decoding_backend
        if backend not in valid_guided_backends:
            raise ValueError(f"Invalid guided_decoding_backend '{backend},"
@@ -2222,7 +2223,7 @@ class CompilationConfig(BaseModel):
            from Python, functions can also be passed directly via Python object
            constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`
        - custom inductor passes: see PassConfig for more details
-    
+
    Why we have different sizes for cudagraph and inductor:
    - cudagraph: a cudagraph captured for a specific size can only be used
        for the same size. We need to capture all the sizes we want to use.