[Attention] Update tests to remove deprecated env vars (#30563)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
This commit is contained in:
@@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
|
||||
BACKENDS,
|
||||
)
|
||||
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
backend, monkeypatch: pytest.MonkeyPatch
|
||||
backend,
|
||||
):
|
||||
"""
|
||||
Ensures that the same request (the 'needle' prompt) yields identical output
|
||||
@@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
|
||||
random.seed(seed)
|
||||
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
attention_config = {"backend": backend}
|
||||
# Allow overrides from environment (useful for CI tuning)
|
||||
# "facebook/opt-125m" is too small, doesn't reliably test determinism
|
||||
model = resolve_model_name(backend)
|
||||
@@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
max_num_seqs=max_batch_size,
|
||||
gpu_memory_utilization=gpu_mem_util,
|
||||
max_model_len=max_model_len,
|
||||
attention_config=attention_config,
|
||||
)
|
||||
|
||||
# Baseline generation for the needle prompt alone.
|
||||
@@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
max_num_seqs=max_batch_size,
|
||||
gpu_memory_utilization=gpu_mem_util,
|
||||
max_model_len=max_model_len,
|
||||
attention_config=attention_config,
|
||||
)
|
||||
|
||||
mismatches = 0
|
||||
@@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
|
||||
BACKENDS,
|
||||
)
|
||||
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
backend, monkeypatch: pytest.MonkeyPatch
|
||||
backend,
|
||||
):
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
|
||||
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
|
||||
random.seed(seed)
|
||||
model_name = resolve_model_name(backend)
|
||||
@@ -193,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
dtype="bfloat16", # not everything is supported
|
||||
gpu_memory_utilization=0.9,
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
attention_config={"backend": backend},
|
||||
)
|
||||
|
||||
# Use more realistic prompts for better token generation
|
||||
@@ -381,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
|
||||
"backend",
|
||||
BACKENDS,
|
||||
)
|
||||
def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
|
||||
def test_simple_generation(backend):
|
||||
"""
|
||||
Simple test that runs the model with a basic prompt and prints the output.
|
||||
Useful for quick smoke testing and debugging.
|
||||
"""
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
model = resolve_model_name(backend)
|
||||
|
||||
llm = LLM(
|
||||
@@ -398,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
|
||||
dtype="bfloat16",
|
||||
enable_prefix_caching=False,
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
attention_config={"backend": backend},
|
||||
)
|
||||
|
||||
prompt = "the capital of france is"
|
||||
@@ -444,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail(
|
||||
The test will PASS if we detect differences (proving batch invariance matters).
|
||||
The test will FAIL if everything matches (suggesting batch invariance isn't needed).
|
||||
"""
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
|
||||
# CRITICAL: Disable batch invariance for this test
|
||||
monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0")
|
||||
monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False)
|
||||
@@ -465,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail(
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16",
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
attention_config={"backend": backend},
|
||||
)
|
||||
|
||||
# build ragged prompts to change shapes significantly across BS=1 vs BS=N
|
||||
@@ -649,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail(
|
||||
@skip_unsupported
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
|
||||
def test_decode_logprobs_match_prefill_logprobs(
|
||||
backend, monkeypatch: pytest.MonkeyPatch
|
||||
backend,
|
||||
):
|
||||
"""
|
||||
Test that verifies decode logprobs match prefill logprobs.
|
||||
@@ -664,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs(
|
||||
This ensures that the logprobs from decode are consistent with what
|
||||
we would get if we ran prefill on each prefix.
|
||||
"""
|
||||
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
|
||||
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
|
||||
random.seed(seed)
|
||||
model_name = resolve_model_name(backend)
|
||||
@@ -689,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs(
|
||||
max_model_len=8192,
|
||||
dtype="bfloat16",
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
attention_config={"backend": backend},
|
||||
)
|
||||
|
||||
# Use a few test prompts
|
||||
@@ -920,6 +919,7 @@ def LLM_with_max_seqs(
|
||||
max_num_seqs: int,
|
||||
gpu_memory_utilization: float,
|
||||
max_model_len: int,
|
||||
attention_config: dict | None = None,
|
||||
) -> LLM:
|
||||
"""
|
||||
Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
|
||||
@@ -934,6 +934,7 @@ def LLM_with_max_seqs(
|
||||
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
|
||||
enable_prefix_caching=False,
|
||||
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
|
||||
attention_config=attention_config,
|
||||
# Enable for MOE models
|
||||
# enable_expert_parallel=True,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user