[V1][TPU] Remove unnecessary padding for running on TPU. (#14467)

2025-03-08 18:56:04 -08:00
parent b0d541947a
commit 10f7552789
2 changed files with 6 additions and 18 deletions
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -12,8 +12,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import CommonAttentionState

 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 16
-NUM_KV_PAGES_PER_BLOCK = 256
+NUM_QUERIES_PER_BLOCK = 32
+NUM_KV_PAGES_PER_BLOCK = 128


 class PallasAttentionBackend(AttentionBackend):