Upgrade xgrammar to 0.1.23 (#22988)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
Russell Bryant
2025-09-02 22:32:59 -04:00
committed by GitHub
parent 42dc59dbac
commit e32a0e8678
2 changed files with 2 additions and 9 deletions

View File

@@ -90,15 +90,11 @@ from .utils import (AttentionGroup, MultiModalBudget,
 if TYPE_CHECKING:
     import xgrammar as xgr
-    import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile # noqa: E501
     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
     from vllm.v1.core.sched.output import SchedulerOutput
 else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")
-    xgr_torch_compile = LazyLoader(
-        "xgr_torch_compile", globals(),
-        "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile")
 logger = init_logger(__name__)
@@ -1333,10 +1329,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 # so we receive it in that format.
 grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous()
-# Force use of the torch.compile implementation from xgrammar to work
-# around issues with the Triton kernel in concurrent structured output
-# scenarios. See PR #19565 and issues #19493, #18376 for details.
-xgr_torch_compile.apply_token_bitmask_inplace_torch_compile(
+xgr.apply_token_bitmask_inplace(
     logits,
     grammar_bitmask.to(self.device, non_blocking=True),
     indices=out_indices if not skip_out_indices else None,