[CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 14:16:02 -07:00
parent 0005d2a3c9
commit f1816fb192
18 changed files with 81 additions and 39 deletions
--- a/tests/v1/e2e/spec_decode/__init__.py
+++ b/tests/v1/e2e/spec_decode/__init__.py
--- a/tests/v1/e2e/spec_decode/test_async_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_async_spec_decode.py
@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test that verifies no implicit GPU-CPU synchronization occurs during
+speculative decoding generation under expected conditions.
+"""
+
+import multiprocessing
+import sys
+import traceback
+
+import pytest
+import torch
+
+
+@pytest.fixture
+def sync_tracker():
+    """
+    Instrument CommonAttentionMetadata.seq_lens_cpu for the length of a test.
+
+    While the fixture is active, every access that finds ``_seq_lens_cpu``
+    unset bumps a shared counter and dumps a stack trace to stderr before
+    delegating to the original getter. The yielded object exposes the counter
+    through ``count`` and ``assert_no_sync()``. On teardown the original
+    property is reinstated and ``torch._dynamo.reset()`` is called.
+    """
+    from vllm.v1.attention.backend import CommonAttentionMetadata
+
+    # Keep the untouched descriptor for teardown, and its getter so the
+    # wrapper below can delegate to it.
+    saved_property = CommonAttentionMetadata.seq_lens_cpu
+    real_getter = saved_property.fget
+
+    # multiprocessing.Value so the tally can be shared with forked children.
+    lazy_init_tally = multiprocessing.Value("i", 0)
+
+    def tracking_seq_lens_cpu(self):
+        # Value already present: nothing to record, just delegate.
+        if self._seq_lens_cpu is not None:
+            return real_getter(self)
+
+        with lazy_init_tally.get_lock():
+            lazy_init_tally.value += 1
+            count = lazy_init_tally.value
+
+        # Report right away so the trace shows up in the subprocess output.
+        for line in (
+            f"\n{'=' * 60}",
+            f"SYNC #{count}: seq_lens_cpu lazy init triggered!",
+            f"{'=' * 60}",
+        ):
+            print(line, file=sys.stderr)
+        traceback.print_stack(file=sys.stderr)
+        print(f"{'=' * 60}\n", file=sys.stderr)
+        sys.stderr.flush()
+        return real_getter(self)
+
+    class SyncTracker:
+        """Read-only view over the lazy-init tally."""
+
+        @property
+        def count(self) -> int:
+            return lazy_init_tally.value
+
+        def assert_no_sync(self, msg: str = ""):
+            count = self.count
+            assert count == 0, (
+                f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered "
+                f"{count} times. See stack traces above. {msg}"
+            )
+
+    tracker = SyncTracker()
+    # Install the tracking property just before handing control to the test.
+    CommonAttentionMetadata.seq_lens_cpu = property(tracking_seq_lens_cpu)
+    yield tracker
+
+    # Teardown: put the original property back.
+    CommonAttentionMetadata.seq_lens_cpu = saved_property
+    torch._dynamo.reset()
+
+
+# Test configurations: (model, spec_model, method, num_spec_tokens)
+# The four positional values of each entry line up with the argument names
+# in the parametrize decorator of test_no_sync_with_spec_decode; ``id`` is the
+# label pytest gives the generated test case.
+SPEC_DECODE_CONFIGS = [
+    pytest.param(
+        "meta-llama/Llama-3.2-1B-Instruct",
+        "nm-testing/Llama3_2_1B_speculator.eagle3",
+        "eagle3",
+        2,
+        id="eagle3-llama",
+    ),
+    pytest.param(
+        "eagle618/deepseek-v3-random",
+        "eagle618/eagle-deepseek-v3-random",
+        "eagle",
+        2,
+        id="eagle-mla-deepseek",
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "model,spec_model,method,num_spec_tokens",
+    SPEC_DECODE_CONFIGS,
+)
+def test_no_sync_with_spec_decode(
+    sync_tracker,
+    model: str,
+    spec_model: str,
+    method: str,
+    num_spec_tokens: int,
+):
+    """
+    Test that no implicit GPU-CPU sync occurs during speculative decoding
+    generation.
+
+    Builds an eager-mode ``LLM`` with async scheduling and the given
+    speculative config, generates a short greedy completion, releases the
+    engine, and then asserts that ``sync_tracker`` recorded no syncs.
+
+    Args:
+        sync_tracker: Fixture object exposing ``assert_no_sync()``.
+        model: Name passed to ``LLM(model=...)``.
+        spec_model: Name passed as the speculative config's ``"model"``.
+        method: Value passed as the speculative config's ``"method"``.
+        num_spec_tokens: Value passed as ``"num_speculative_tokens"``.
+    """
+    # Import vLLM AFTER sync_tracker fixture has applied the patch
+    from vllm import LLM, SamplingParams
+    from vllm.distributed import cleanup_dist_env_and_memory
+
+    llm = LLM(
+        model=model,
+        max_model_len=256,
+        speculative_config={
+            "method": method,
+            "num_speculative_tokens": num_spec_tokens,
+            "model": spec_model,
+        },
+        enforce_eager=True,
+        async_scheduling=True,
+    )
+
+    try:
+        outputs = llm.generate(
+            ["Hello, my name is"],
+            SamplingParams(temperature=0, max_tokens=10),
+        )
+
+        assert len(outputs) == 1
+        assert len(outputs[0].outputs[0].text) > 0
+    finally:
+        # Release the engine even when generation or an assertion above
+        # fails, so the next parametrized case does not start with this
+        # engine still alive.
+        del llm
+        torch.accelerator.empty_cache()
+        cleanup_dist_env_and_memory()
+
+    sync_tracker.assert_no_sync()
--- a/tests/v1/e2e/spec_decode/test_lora_with_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_lora_with_spec_decode.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This script contains:
+1. test lora with speculative decoding for batch inference
+"""
+
+import random
+
+import numpy as np
+import pytest
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
+
+# Maps a LoRA adapter path to the prompt used to exercise that adapter; the
+# test below looks the prompt up by the ``lora_path`` of its model setup.
+LORA_TEST_PROMPT_MAP: dict[str, str] = {}
+
+LORA_TEST_PROMPT_MAP["premjatin/qwen-linear-algebra-coder"] = """
+### INSTRUCTION:
+You are an AI assistant that generates Python code to solve linear
+algebra problems.
+
+### PROBLEM:
+Find the eigenvalues and eigenvectors of the following 3x3 matrix:
+[[3, 2, 0],
+ [2, 3, 0],
+ [0, 0, 2]]
+
+### OUTPUT FORMAT (STRICT):
+Numbers should be represented as integers only.
+
+### PYTHON SOLUTION:
+"""
+
+# Seed shared by the torch / numpy / random seeding calls and by
+# SamplingParams in the test below.
+SEED = 42
+
+
+@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
+@pytest.mark.parametrize(
+    "model_setup",
+    [
+        (
+            "eagle3",
+            "Qwen/Qwen3-1.7B",
+            "AngelSlim/Qwen3-1.7B_eagle3",
+            "premjatin/qwen-linear-algebra-coder",
+            1,
+        )
+    ],
+)
+def test_batch_inference_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    model_setup: tuple[str, str, str, str, int],
+):
+    """
+    Compare the outputs of an LLM with only LoRA against an LLM with both
+    speculative decoding and LoRA on the same batch of prompts.
+
+    Neither run may fail during batch inference, and the two runs must agree
+    exactly on more than 90% of the prompts.
+
+    Args:
+        monkeypatch: pytest fixture used to scope the environment override.
+        model_setup: (method, model_name, spec_model_name, lora_path, tp_size)
+    """
+    with monkeypatch.context() as m:
+        # Disable randomness
+        m.setenv("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
+        torch.manual_seed(SEED)
+        np.random.seed(SEED)
+        random.seed(SEED)
+        torch.cuda.manual_seed_all(SEED)
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
+
+        method, model_name, spec_model_name, lora_path, tp_size = model_setup
+
+        # without speculative decoding
+        ref_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            max_model_len=2048,
+            max_num_seqs=4,
+            enable_lora=True,
+            max_loras=1,
+            max_cpu_loras=1,
+            max_lora_rank=16,
+        )
+
+        try:
+            prompts = [LORA_TEST_PROMPT_MAP[lora_path]] * 100
+            lora_request = LoRARequest("adapter", 1, lora_path)
+            sampling_params = SamplingParams(
+                temperature=0.0, top_p=1.0, top_k=-1, seed=SEED, max_tokens=128
+            )
+
+            ref_outputs = ref_llm.generate(
+                prompts, sampling_params, lora_request=lora_request
+            )
+        finally:
+            # Release the reference engine even if generation fails, so it
+            # is never left alive next to the second engine or later tests.
+            del ref_llm
+            torch.accelerator.empty_cache()
+            cleanup_dist_env_and_memory()
+
+        # with speculative decoding
+        lora_spec_llm = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            tensor_parallel_size=tp_size,
+            speculative_config={
+                "method": method,
+                "model": spec_model_name,
+                "num_speculative_tokens": 3,
+                "max_model_len": 2048,
+            },
+            max_model_len=2048,
+            max_num_seqs=4,
+            enable_lora=True,
+            max_loras=1,
+            max_cpu_loras=1,
+            max_lora_rank=16,
+        )
+
+        try:
+            lora_spec_outputs = lora_spec_llm.generate(
+                prompts, sampling_params, lora_request=lora_request
+            )
+        finally:
+            # Same cleanup as above; done before the comparison so a failed
+            # assertion below cannot skip it.
+            del lora_spec_llm
+            torch.accelerator.empty_cache()
+            cleanup_dist_env_and_memory()
+
+        # zip() stops at the shorter input, so check the lengths explicitly
+        # rather than silently comparing only a prefix.
+        assert len(lora_spec_outputs) == len(ref_outputs)
+
+        matches = 0
+        for ref_output, spec_output in zip(ref_outputs, lora_spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        # Heuristic: expect more than 90% of the prompts to match exactly
+        # (strict inequality, i.e. at least 91 of the 100 prompts).
+        # Upon failure, inspect the outputs to check for inaccuracy.
+        print(f"match ratio: {matches}/{len(ref_outputs)}")
+        assert matches > int(0.90 * len(ref_outputs))
--- a/tests/v1/e2e/spec_decode/test_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_spec_decode.py