[TPU] Add TPU specific var VLLM_TPU_MOST_MODEL_LEN (#19919)

Signed-off-by: Chenyaaang <chenyangli@google.com>
This commit is contained in:
Chenyaaang
2025-06-25 15:51:02 -07:00
committed by GitHub
parent 55c65ab495
commit 2d7620c3eb
5 changed files with 184 additions and 76 deletions

View File

@@ -71,6 +71,11 @@ class PallasAttentionBackend(AttentionBackend):
min_page_size = 1 << (min_page_size - 1).bit_length()
return min_page_size
@staticmethod
def get_max_num_seqs(model_len: int, page_size: int) -> int:
    """Return an upper bound on the number of sequences per batch.

    The bound is derived from a fixed per-core scratch-memory budget:
    each request needs one page-table entry per page of its (max) model
    length, so fewer pages per request allows more concurrent requests.

    Args:
        model_len: Maximum model (context) length in tokens.
        page_size: KV-cache page size in tokens.

    Returns:
        Maximum number of sequences that fit the memory budget.
    """
    pages_per_request = cdiv(model_len, page_size)
    # Budget: half of 1 MiB, with 4 bytes per page-table entry --
    # presumably int32 page indices in TPU scalar memory (SMEM);
    # NOTE(review): confirm the 1 MiB / 2 / 4-byte figures against the
    # Pallas TPU kernel this backend drives.
    budget_bytes = (1024 * 1024) // 2
    bytes_per_entry = 4
    return budget_bytes // pages_per_request // bytes_per_entry
# TPU has limited SREGs (scalar registers), if page_size is too small, we
# can spill SREGs easily which leads to bad performance. The strategy we
# apply here is trying to split max-model-len to 16 pages which make the