Adding optional speculator tests for larger models (#32943)
Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
This commit is contained in:
@@ -362,7 +362,7 @@ steps:
|
||||
- pytest -v -s v1/sample
|
||||
- pytest -v -s v1/logits_processors
|
||||
- pytest -v -s v1/worker
|
||||
- pytest -v -s v1/spec_decode
|
||||
- pytest -v -s -m 'not slow_test' v1/spec_decode
|
||||
- pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
|
||||
- pytest -v -s -m 'not cpu_test' v1/metrics
|
||||
- pytest -v -s v1/test_oracle.py
|
||||
@@ -1420,6 +1420,20 @@ steps:
|
||||
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest -v -s -x lora/test_mixtral.py
|
||||
|
||||
- label: Acceptance Length Test (Large Models) # optional
|
||||
timeout_in_minutes: 120
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 1
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/model_executor/models/mlp_speculator.py
|
||||
- tests/v1/spec_decode/test_acceptance_length.py
|
||||
commands:
|
||||
- export VLLM_ALLOW_INSECURE_SERIALIZATION=1
|
||||
- pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test
|
||||
|
||||
- label: LM Eval Large Models # optional
|
||||
gpu: a100
|
||||
optional: true
|
||||
|
||||
@@ -35,6 +35,10 @@ class Eagle3ModelConfig:
|
||||
id: str = ""
|
||||
# Backends that are incompatible with this model (will be skipped)
|
||||
excluded_backends: set[AttentionBackendEnum] = field(default_factory=set)
|
||||
# Pytest marks applied to this configuration's test parameter (e.g., pytest.mark.slow_test)
|
||||
marks: list = field(default_factory=list)
|
||||
# Custom relative tolerance (defaults to DEFAULT_RTOL if None)
|
||||
rtol: float | None = None
|
||||
|
||||
|
||||
# Model configurations for EAGLE3 acceptance length tests.
|
||||
@@ -65,6 +69,17 @@ EAGLE3_MODEL_CONFIGS = [
|
||||
# FLASHINFER does not support this model ("sink setting not supported")
|
||||
excluded_backends={AttentionBackendEnum.FLASHINFER},
|
||||
),
|
||||
Eagle3ModelConfig(
|
||||
verifier="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
|
||||
drafter="nm-testing/Speculator-Qwen3-30B-MOE-VL-Eagle3",
|
||||
expected_acceptance_length=1.35,
|
||||
expected_acceptance_lengths_per_pos=[0.2900, 0.0620, 0.0115],
|
||||
id="qwen3-30b-moe-vl-eagle3",
|
||||
marks=[
|
||||
pytest.mark.slow_test,
|
||||
],
|
||||
rtol=0.15, # Higher tolerance due to small absolute values at position 2
|
||||
),
|
||||
]
|
||||
|
||||
# Default test parameters
|
||||
@@ -194,9 +209,16 @@ def extract_acceptance_metrics(metrics, num_spec_tokens: int) -> dict:
|
||||
|
||||
|
||||
@large_gpu_mark(min_gb=40)
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(),
|
||||
reason="This test is only supported on CUDA platform.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"model_config",
|
||||
[pytest.param(config, id=config.id) for config in EAGLE3_MODEL_CONFIGS],
|
||||
[
|
||||
pytest.param(config, id=config.id, marks=config.marks)
|
||||
for config in EAGLE3_MODEL_CONFIGS
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("num_spec_tokens", [DEFAULT_NUM_SPEC_TOKENS])
|
||||
@pytest.mark.parametrize("tp_size", get_tp_size_params())
|
||||
@@ -251,6 +273,7 @@ def test_eagle3_acceptance_length(
|
||||
|
||||
rel_error = abs(actual_acceptance_length - expected) / expected
|
||||
|
||||
# Overall acceptance length always uses DEFAULT_RTOL
|
||||
assert rel_error <= DEFAULT_RTOL, (
|
||||
f"Acceptance length regression detected for {model_config.id}!\n"
|
||||
f" Expected: {expected:.3f}\n"
|
||||
@@ -261,18 +284,22 @@ def test_eagle3_acceptance_length(
|
||||
)
|
||||
|
||||
if expected_per_pos and len(expected_per_pos) == len(actual_per_pos):
|
||||
# Per-position checks use model-specific rtol if provided
|
||||
rtol = (
|
||||
model_config.rtol if model_config.rtol is not None else DEFAULT_RTOL
|
||||
)
|
||||
for pos, (actual, exp) in enumerate(
|
||||
zip(actual_per_pos, expected_per_pos)
|
||||
):
|
||||
if exp > 0:
|
||||
pos_rel_error = abs(actual - exp) / exp
|
||||
assert pos_rel_error <= DEFAULT_RTOL, (
|
||||
assert pos_rel_error <= rtol, (
|
||||
f"Per-position acceptance length regression at pos {pos} "
|
||||
f"for {model_config.id}!\n"
|
||||
f" Expected: {exp:.3f}\n"
|
||||
f" Actual: {actual:.3f}\n"
|
||||
f" Relative error: {pos_rel_error:.2%} "
|
||||
f"(tolerance: {DEFAULT_RTOL:.2%})"
|
||||
f"(tolerance: {rtol:.2%})"
|
||||
)
|
||||
|
||||
print(
|
||||
|
||||
Reference in New Issue
Block a user