[Attention] Support distinguishing between short extends and decodes (#37303)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson
2026-03-20 10:49:36 -07:00
committed by GitHub
parent 79eb9369c5
commit e1d85e5c24
9 changed files with 176 additions and 133 deletions

View File

@@ -70,3 +70,15 @@ steps:
device: mi325_4
depends_on:
- image-build-amd
- label: V1 e2e (4xH100)
timeout_in_minutes: 60
device: h100
num_devices: 4
optional: true
source_file_dependencies:
- vllm/v1/attention/backends/utils.py
- vllm/v1/worker/gpu_model_runner.py
- tests/v1/e2e/test_hybrid_chunked_prefill.py
commands:
- pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py