[CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
0
tests/v1/e2e/spec_decode/__init__.py
Normal file
0
tests/v1/e2e/spec_decode/__init__.py
Normal file
131
tests/v1/e2e/spec_decode/test_async_spec_decode.py
Normal file
131
tests/v1/e2e/spec_decode/test_async_spec_decode.py
Normal file
@@ -0,0 +1,131 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Test that verifies no implicit GPU-CPU synchronization occurs during
|
||||
speculative decoding generation under expected conditions.
|
||||
"""
|
||||
|
||||
import multiprocessing
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sync_tracker():
|
||||
"""
|
||||
Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
|
||||
lazy init syncs. Prints stack traces immediately when syncs occur.
|
||||
"""
|
||||
from vllm.v1.attention.backend import CommonAttentionMetadata
|
||||
|
||||
# Shared counter for cross-process communication (inherited by fork)
|
||||
sync_count = multiprocessing.Value("i", 0)
|
||||
|
||||
# Save original property
|
||||
original_prop = CommonAttentionMetadata.seq_lens_cpu
|
||||
original_fget = original_prop.fget
|
||||
|
||||
# Create tracking wrapper
|
||||
def tracking_seq_lens_cpu(self):
|
||||
if self._seq_lens_cpu is None:
|
||||
# Increment counter
|
||||
with sync_count.get_lock():
|
||||
sync_count.value += 1
|
||||
count = sync_count.value
|
||||
# Print stack trace immediately (shows in subprocess output)
|
||||
print(f"\n{'=' * 60}", file=sys.stderr)
|
||||
print(f"SYNC #{count}: seq_lens_cpu lazy init triggered!", file=sys.stderr)
|
||||
print(f"{'=' * 60}", file=sys.stderr)
|
||||
traceback.print_stack(file=sys.stderr)
|
||||
print(f"{'=' * 60}\n", file=sys.stderr)
|
||||
sys.stderr.flush()
|
||||
return original_fget(self)
|
||||
|
||||
# Apply patch
|
||||
CommonAttentionMetadata.seq_lens_cpu = property(tracking_seq_lens_cpu)
|
||||
|
||||
class SyncTracker:
|
||||
@property
|
||||
def count(self) -> int:
|
||||
return sync_count.value
|
||||
|
||||
def assert_no_sync(self, msg: str = ""):
|
||||
count = sync_count.value
|
||||
assert count == 0, (
|
||||
f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered "
|
||||
f"{count} times. See stack traces above. {msg}"
|
||||
)
|
||||
|
||||
yield SyncTracker()
|
||||
|
||||
# Restore original property
|
||||
CommonAttentionMetadata.seq_lens_cpu = original_prop
|
||||
torch._dynamo.reset()
|
||||
|
||||
|
||||
# Test configurations: (model, spec_model, method, num_spec_tokens, backend_env)
|
||||
SPEC_DECODE_CONFIGS = [
|
||||
pytest.param(
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"nm-testing/Llama3_2_1B_speculator.eagle3",
|
||||
"eagle3",
|
||||
2,
|
||||
id="eagle3-llama",
|
||||
),
|
||||
pytest.param(
|
||||
"eagle618/deepseek-v3-random",
|
||||
"eagle618/eagle-deepseek-v3-random",
|
||||
"eagle",
|
||||
2,
|
||||
id="eagle-mla-deepseek",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model,spec_model,method,num_spec_tokens",
|
||||
SPEC_DECODE_CONFIGS,
|
||||
)
|
||||
def test_no_sync_with_spec_decode(
|
||||
sync_tracker,
|
||||
model: str,
|
||||
spec_model: str,
|
||||
method: str,
|
||||
num_spec_tokens: int,
|
||||
):
|
||||
"""
|
||||
Test that no implicit GPU-CPU sync occurs during speculative decoding
|
||||
generation.
|
||||
"""
|
||||
# Import vLLM AFTER sync_tracker fixture has applied the patch
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
llm = LLM(
|
||||
model=model,
|
||||
max_model_len=256,
|
||||
speculative_config={
|
||||
"method": method,
|
||||
"num_speculative_tokens": num_spec_tokens,
|
||||
"model": spec_model,
|
||||
},
|
||||
enforce_eager=True,
|
||||
async_scheduling=True,
|
||||
)
|
||||
|
||||
outputs = llm.generate(
|
||||
["Hello, my name is"],
|
||||
SamplingParams(temperature=0, max_tokens=10),
|
||||
)
|
||||
|
||||
assert len(outputs) == 1
|
||||
assert len(outputs[0].outputs[0].text) > 0
|
||||
|
||||
del llm
|
||||
torch.accelerator.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
sync_tracker.assert_no_sync()
|
||||
139
tests/v1/e2e/spec_decode/test_lora_with_spec_decode.py
Normal file
139
tests/v1/e2e/spec_decode/test_lora_with_spec_decode.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This script contains:
|
||||
1. test lora with speculative decoding for batch inference
|
||||
"""
|
||||
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
LORA_TEST_PROMPT_MAP: dict[str, str] = {}
|
||||
|
||||
LORA_TEST_PROMPT_MAP["premjatin/qwen-linear-algebra-coder"] = """
|
||||
### INSTRUCTION:
|
||||
You are an AI assistant that generates Python code to solve linear
|
||||
algebra problems.
|
||||
|
||||
### PROBLEM:
|
||||
Find the eigenvalues and eigenvectors of the following 3x3 matrix:
|
||||
[[3, 2, 0],
|
||||
[2, 3, 0],
|
||||
[0, 0, 2]]
|
||||
|
||||
### OUTPUT FORMAT (STRICT):
|
||||
Numbers should be represented as integers only.
|
||||
|
||||
### PYTHON SOLUTION:
|
||||
"""
|
||||
|
||||
SEED = 42
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_cuda(), reason="CUDA not available")
|
||||
@pytest.mark.parametrize(
|
||||
"model_setup",
|
||||
[
|
||||
(
|
||||
"eagle3",
|
||||
"Qwen/Qwen3-1.7B",
|
||||
"AngelSlim/Qwen3-1.7B_eagle3",
|
||||
"premjatin/qwen-linear-algebra-coder",
|
||||
1,
|
||||
)
|
||||
],
|
||||
)
|
||||
def test_batch_inference_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
model_setup: tuple[str, str, str, str, int],
|
||||
):
|
||||
"""
|
||||
Compare the outputs of a LLM with only Lora and a LLM with both SD and Lora.
|
||||
Should be the same and no failure when doing batch inference.
|
||||
model_setup: (method, model_name, spec_model_name, lora_path, tp_size)
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
# Disable randomness
|
||||
m.setenv("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
|
||||
torch.manual_seed(SEED)
|
||||
np.random.seed(SEED)
|
||||
random.seed(SEED)
|
||||
torch.cuda.manual_seed_all(SEED)
|
||||
torch.backends.cudnn.benchmark = False
|
||||
torch.backends.cudnn.deterministic = True
|
||||
|
||||
method, model_name, spec_model_name, lora_path, tp_size = model_setup
|
||||
|
||||
# without speculative decoding
|
||||
ref_llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
tensor_parallel_size=tp_size,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=4,
|
||||
enable_lora=True,
|
||||
max_loras=1,
|
||||
max_cpu_loras=1,
|
||||
max_lora_rank=16,
|
||||
)
|
||||
|
||||
prompts = [LORA_TEST_PROMPT_MAP[lora_path]] * 100
|
||||
lora_request = LoRARequest("adapter", 1, lora_path)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.0, top_p=1.0, top_k=-1, seed=SEED, max_tokens=128
|
||||
)
|
||||
|
||||
ref_outputs = ref_llm.generate(
|
||||
prompts, sampling_params, lora_request=lora_request
|
||||
)
|
||||
del ref_llm
|
||||
torch.accelerator.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
lora_spec_llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
tensor_parallel_size=tp_size,
|
||||
speculative_config={
|
||||
"method": method,
|
||||
"model": spec_model_name,
|
||||
"num_speculative_tokens": 3,
|
||||
"max_model_len": 2048,
|
||||
},
|
||||
max_model_len=2048,
|
||||
max_num_seqs=4,
|
||||
enable_lora=True,
|
||||
max_loras=1,
|
||||
max_cpu_loras=1,
|
||||
max_lora_rank=16,
|
||||
)
|
||||
|
||||
lora_spec_outputs = lora_spec_llm.generate(
|
||||
prompts, sampling_params, lora_request=lora_request
|
||||
)
|
||||
|
||||
matches = 0
|
||||
misses = 0
|
||||
for ref_output, spec_output in zip(ref_outputs, lora_spec_outputs):
|
||||
if ref_output.outputs[0].text == spec_output.outputs[0].text:
|
||||
matches += 1
|
||||
else:
|
||||
misses += 1
|
||||
print(f"ref_output: {ref_output.outputs[0].text}")
|
||||
print(f"spec_output: {spec_output.outputs[0].text}")
|
||||
|
||||
# Heuristic: expect at least 90% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
print(f"match ratio: {matches}/{len(ref_outputs)}")
|
||||
assert matches > int(0.90 * len(ref_outputs))
|
||||
del lora_spec_llm
|
||||
torch.accelerator.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
1033
tests/v1/e2e/spec_decode/test_spec_decode.py
Normal file
1033
tests/v1/e2e/spec_decode/test_spec_decode.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user