[Feature] Batch-Invariant Support for FA2 and LoRA (#30018)

Signed-off-by: quanliu <18646313696@163.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
quanliu
2025-12-09 23:01:38 +08:00
committed by GitHub
parent 5c213d2899
commit 5dcd593baf
3 changed files with 23 additions and 3 deletions

View File

@@ -11,8 +11,10 @@ from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer
skip_unsupported = pytest.mark.skipif(
-    not (current_platform.is_cuda() and current_platform.has_device_capability(90)),
-    reason="Requires CUDA and >= Hopper (SM90)",
+    not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
+    # Supports testing on Ampere and Ada Lovelace devices.
+    # Note: For devices with SM < 90, batch invariance does not support CUDA Graphs.
+    reason="Requires CUDA and >= Ampere (SM80)",
)
BACKENDS: list[str] = [
@@ -97,3 +99,7 @@ def _extract_step_logprobs(request_output):
return t, inner.token_ids
return None, None
def is_device_capability_below_90() -> bool:
    """Return True when the current device's compute capability is below SM90.

    Used by tests to branch on pre-Hopper hardware (e.g. Ampere/Ada), where
    batch invariance does not support CUDA Graphs per the skip marker above.
    """
    meets_sm90 = current_platform.has_device_capability(90)
    return not meets_sm90