From 821fde2df470e732bb2061daf1e8ef9838d7cce6 Mon Sep 17 00:00:00 2001
From: Karan Bansal
Date: Sat, 14 Mar 2026 22:59:06 +0530
Subject: [PATCH] [Bugfix] Fix xgrammar dtype mismatch on macOS CPU inference
 (#32384)

Signed-off-by: Karan Bansal
Co-authored-by: Inokinoki
---
 vllm/v1/structured_output/utils.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index aadd057b1..0d31363cb 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -116,7 +116,18 @@ def apply_grammar_bitmask(
         )
         index_tensor = index_tensor.to(logits.device, non_blocking=True)
 
-    xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor)
+    # Handle dtype conversion for CPU (older xgrammar CPU kernels require float32)
+    # See: https://github.com/vllm-project/vllm/issues/31901
+    if logits.device.type == "cpu" and logits.dtype != torch.float32:
+        # Convert to float32, apply bitmask, then convert back
+        logits_float32 = logits.to(torch.float32)
+        xgr.apply_token_bitmask_inplace(
+            logits_float32, grammar_bitmask, indices=index_tensor
+        )
+        # Copy the modified values back to the original tensor
+        logits.copy_(logits_float32.to(logits.dtype))
+    else:
+        xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor)
 
 
 class OutlinesVocabulary: