[TPU] Support sliding window and logit soft capping in the paged attention kernel for TPU. (#15732)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
This commit is contained in:
@@ -13,18 +13,24 @@ import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
|
||||
MODEL_NAMES = [
|
||||
"Qwen/Qwen2-1.5B-Instruct",
|
||||
"google/gemma-3-1b-it",
|
||||
]
|
||||
NUM_CONCURRENT = 500
|
||||
TASK = "gsm8k"
|
||||
FILTER = "exact_match,strict-match"
|
||||
RTOL = 0.03
|
||||
EXPECTED_VALUE = 0.58
|
||||
EXPECTED_VALUES = {
|
||||
"Qwen/Qwen2-1.5B-Instruct": 0.58,
|
||||
"google/gemma-3-1b-it": 0.25,
|
||||
}
|
||||
|
||||
|
||||
def run_test(more_args=None):
|
||||
def run_test(model_name, more_args=None):
|
||||
"""Run the end to end accuracy test."""
|
||||
|
||||
model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
|
||||
model_args = f"pretrained={model_name},max_model_len=4096"
|
||||
|
||||
if more_args is not None:
|
||||
model_args = "{},{}".format(model_args, more_args)
|
||||
@@ -37,9 +43,12 @@ def run_test(more_args=None):
|
||||
)
|
||||
|
||||
measured_value = results["results"][TASK][FILTER]
|
||||
assert (measured_value - RTOL < EXPECTED_VALUE
|
||||
and measured_value + RTOL > EXPECTED_VALUE
|
||||
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
|
||||
assert model_name in EXPECTED_VALUES, (
|
||||
f"Cannot find the expected value for the model {model_name=}")
|
||||
expected_value = EXPECTED_VALUES[model_name]
|
||||
assert (measured_value - RTOL < expected_value
|
||||
and measured_value + RTOL > expected_value
|
||||
), f"Expected: {expected_value} | Measured: {measured_value}"
|
||||
|
||||
|
||||
# TODO: [AlexM] Fix it with new CI/CD tests
|
||||
@@ -49,7 +58,8 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
|
||||
@pytest.mark.skipif(not current_platform.is_cuda()
|
||||
and not current_platform.is_tpu(),
|
||||
reason="V1 is currently only supported on CUDA and TPU")
|
||||
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
@pytest.mark.parametrize("model", MODEL_NAMES)
|
||||
def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
|
||||
"""Run with the V1 Engine."""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
@@ -64,7 +74,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
if TPU_TP_TEST_STR:
|
||||
more_args += ",{}".format(TPU_TP_TEST_STR)
|
||||
|
||||
run_test(more_args)
|
||||
run_test(model, more_args)
|
||||
|
||||
|
||||
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
@@ -72,4 +82,4 @@ def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
run_test()
|
||||
run_test("Qwen/Qwen2-1.5B-Instruct")
|
||||
|
||||
Reference in New Issue
Block a user