[V1] Prevent xgrammar from breaking TPU support (#14575)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-10 19:06:19 -04:00
parent 432d6dad15
commit 04421dff8a
2 changed files with 11 additions and 2 deletions
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -17,6 +17,7 @@ from vllm.v1.structured_output.grammar import (Grammar, StructuredOutputKey,
 if TYPE_CHECKING:
    import numpy as np
    import numpy.typing as npt
+    import torch
    import xgrammar as xgr

    from vllm.v1.request import Request
@@ -53,8 +54,7 @@ class StructuredOutputManager:
        # compilation, so we set it to half the number of CPUs.
        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
-        self._grammar_bitmask = xgr.allocate_token_bitmask(
-            self.vllm_config.scheduler_config.max_num_seqs, self.vocab_size)
+        self._grammar_bitmask: Optional[torch.Tensor] = None

    def __getitem__(self, key: StructuredOutputKey) -> Optional[Grammar]:
        # We need to pop and re-insert the grammar here for LRU cache
@@ -134,6 +134,11 @@ class StructuredOutputManager:
        if not structured_output_request_ids:
            return None

+        if self._grammar_bitmask is None:
+            self._grammar_bitmask = xgr.allocate_token_bitmask(
+                self.vllm_config.scheduler_config.max_num_seqs,
+                self.vocab_size)
+
        # Fill the bitmask using the index of each request equal to its
        # position in the batch. Resize the bitmask down to the size of
        # the batch.