[TPU] Add TPU specific var VLLM_TPU_MOST_MODEL_LEN (#19919)
Signed-off-by: Chenyaaang <chenyangli@google.com>
@@ -71,6 +71,11 @@ class PallasAttentionBackend(AttentionBackend):
         min_page_size = 1 << (min_page_size - 1).bit_length()
         return min_page_size
 
+    @staticmethod
+    def get_max_num_seqs(model_len: int, page_size: int) -> int:
+        num_page_per_req = cdiv(model_len, page_size)
+        return 1024 * 1024 // 2 // num_page_per_req // 4
+
     # TPU has limited SREGs (scalar registers), if page_size is too small, we
     # can spill SREGs easily which leads to bad performance. The strategy we
     # apply here is trying to split max-model-len to 16 pages which make the
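
For context, a minimal self-contained sketch of the new helper follows. cdiv is assumed to be the ceiling-division utility this file already imports; the constants in the return expression are copied verbatim from the diff, and the worked example at the end is illustrative only.

def cdiv(a: int, b: int) -> int:
    # Ceiling division, e.g. cdiv(7, 4) == 2.
    return -(a // -b)

def get_max_num_seqs(model_len: int, page_size: int) -> int:
    # Pages needed to hold one request at the full model length.
    num_page_per_req = cdiv(model_len, page_size)
    # Constants copied verbatim from the diff above.
    return 1024 * 1024 // 2 // num_page_per_req // 4

# Worked example: an 8192-token model with 256-token pages needs
# cdiv(8192, 256) == 32 pages per request, so the helper returns
# 1048576 // 2 // 32 // 4 == 4096 sequences.
print(get_max_num_seqs(8192, 256))  # -> 4096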
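The trailing comment in the hunk describes the page-size strategy: aim for roughly 16 pages per max-length request so scalar-register spills become less likely. A hedged sketch of that idea, combining it with the power-of-two rounding visible in the hunk's context lines, might look like the following; pick_page_size is a hypothetical name for illustration, not the function the commit actually adds.

def next_power_of_2(n: int) -> int:
    # 1 << (n - 1).bit_length() rounds n up to the nearest power of two,
    # e.g. 100 -> 128 and 128 -> 128 (the same trick min_page_size uses above).
    return 1 << (n - 1).bit_length()

def pick_page_size(max_model_len: int) -> int:
    # Hypothetical: split max-model-len into ~16 pages, then round the
    # per-page size up to a power of two as the backend requires.
    return next_power_of_2(max(1, max_model_len // 16))

# Illustration: 8192 // 16 == 512, already a power of two, so a
# max-model-len of 8192 would yield 512-token pages under this sketch.
print(pick_page_size(8192))  # -> 512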