Use the optimized block sizes after tuning the kernel. (#14329)

This commit is contained in:
iefgnoix
2025-03-07 05:25:13 -08:00
committed by GitHub
parent f7a6bd0fa1
commit 1e3598edeb

View File

@@ -12,8 +12,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import CommonAttentionState
 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 32
-NUM_KV_PAGES_PER_BLOCK = 128
+NUM_QUERIES_PER_BLOCK = 16
+NUM_KV_PAGES_PER_BLOCK = 256
 class PallasAttentionBackend(AttentionBackend):