[Attention] Support distinguishing between short extends and decodes (#37303)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-03-20 10:49:36 -07:00
parent 79eb9369c5
commit e1d85e5c24
9 changed files with 176 additions and 133 deletions
--- a/tests/v1/e2e/test_hybrid_chunked_prefill.py
+++ b/tests/v1/e2e/test_hybrid_chunked_prefill.py
@@ -43,7 +43,7 @@ MESSAGES = [
        pytest.param("Qwen/Qwen3.5-4B", marks=[large_gpu_mark(min_gb=40)]),
        pytest.param(
            "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
-            marks=[large_gpu_mark(min_gb=80)] + multi_gpu_marks(num_gpus=2),
+            marks=[large_gpu_mark(min_gb=80)] + multi_gpu_marks(num_gpus=4),
        ),
    ],
 )
@@ -68,7 +68,7 @@ def test_mtp_speculative_mixed_batch_short_prefill(
        max_num_batched_tokens=chunk_size,
        max_model_len=512,
        enforce_eager=True,
-        tensor_parallel_size=2,
+        tensor_parallel_size=4,
        trust_remote_code=True,
        enable_chunked_prefill=True,
        enable_prefix_caching=enable_prefix_caching,