diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6edcb2e7d..bcd9997a4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -362,7 +362,7 @@ steps: - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker - - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not slow_test' v1/spec_decode - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit - pytest -v -s -m 'not cpu_test' v1/metrics - pytest -v -s v1/test_oracle.py @@ -1420,6 +1420,20 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py +- label: Acceptance Length Test (Large Models) # optional + timeout_in_minutes: 120 + gpu: h100 + optional: true + num_gpus: 1 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/model_executor/models/mlp_speculator.py + - tests/v1/spec_decode/test_acceptance_length.py + commands: + - export VLLM_ALLOW_INSECURE_SERIALIZATION=1 + - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test + - label: LM Eval Large Models # optional gpu: a100 optional: true diff --git a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py index 7276688f4..4f43c9247 100644 --- a/tests/v1/spec_decode/test_acceptance_length.py +++ b/tests/v1/spec_decode/test_acceptance_length.py @@ -35,6 +35,10 @@ class Eagle3ModelConfig: id: str = "" # Backends that are incompatible with this model (will be skipped) excluded_backends: set[AttentionBackendEnum] = field(default_factory=set) + # Pytest marks for this configuration (e.g., pytest.mark.optional) + marks: list = field(default_factory=list) + # Custom relative tolerance (defaults to DEFAULT_RTOL if None) + rtol: float | None = None # Model configurations for EAGLE3 acceptance length tests. @@ -65,6 +69,17 @@ EAGLE3_MODEL_CONFIGS = [ # FLASHINFER does not support ("sink setting not supported") excluded_backends={AttentionBackendEnum.FLASHINFER}, ), + Eagle3ModelConfig( + verifier="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8", + drafter="nm-testing/Speculator-Qwen3-30B-MOE-VL-Eagle3", + expected_acceptance_length=1.35, + expected_acceptance_lengths_per_pos=[0.2900, 0.0620, 0.0115], + id="qwen3-30b-moe-vl-eagle3", + marks=[ + pytest.mark.slow_test, + ], + rtol=0.15, # Higher tolerance due to small absolute values at position 2 + ), ] # Default test parameters @@ -194,9 +209,16 @@ def extract_acceptance_metrics(metrics, num_spec_tokens: int) -> dict: @large_gpu_mark(min_gb=40) +@pytest.mark.skipif( + not current_platform.is_cuda(), + reason="This test is only supported on CUDA platform.", +) @pytest.mark.parametrize( "model_config", - [pytest.param(config, id=config.id) for config in EAGLE3_MODEL_CONFIGS], + [ + pytest.param(config, id=config.id, marks=config.marks) + for config in EAGLE3_MODEL_CONFIGS + ], ) @pytest.mark.parametrize("num_spec_tokens", [DEFAULT_NUM_SPEC_TOKENS]) @pytest.mark.parametrize("tp_size", get_tp_size_params()) @@ -251,6 +273,7 @@ def test_eagle3_acceptance_length( rel_error = abs(actual_acceptance_length - expected) / expected + # Overall acceptance length always uses DEFAULT_RTOL assert rel_error <= DEFAULT_RTOL, ( f"Acceptance length regression detected for {model_config.id}!\n" f" Expected: {expected:.3f}\n" @@ -261,18 +284,22 @@ def test_eagle3_acceptance_length( ) if expected_per_pos and len(expected_per_pos) == len(actual_per_pos): + # Per-position checks use model-specific rtol if provided + rtol = ( + model_config.rtol if model_config.rtol is not None else DEFAULT_RTOL + ) for pos, (actual, exp) in enumerate( zip(actual_per_pos, expected_per_pos) ): if exp > 0: pos_rel_error = abs(actual - exp) / exp - assert pos_rel_error <= DEFAULT_RTOL, ( + assert pos_rel_error <= rtol, ( f"Per-position acceptance length regression at pos {pos} " f"for {model_config.id}!\n" f" Expected: {exp:.3f}\n" f" Actual: {actual:.3f}\n" f" Relative error: {pos_rel_error:.2%} " - f"(tolerance: {DEFAULT_RTOL:.2%})" + f"(tolerance: {rtol:.2%})" ) print(