[Scheduler] Warning upon preemption and Swapping (#4647)
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
This commit is contained in:
@@ -6,6 +6,7 @@ Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
|
||||
pytest tests/basic_correctness/test_preemption.py`.
|
||||
"""
|
||||
import pytest
|
||||
from prometheus_client import REGISTRY
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
|
||||
@@ -71,6 +72,7 @@ def test_chunked_prefill_recompute(
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
def test_preemption(
|
||||
caplog_vllm,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@@ -87,10 +89,13 @@ def test_preemption(
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
|
||||
ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
total_preemption = (
|
||||
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
|
||||
del vllm_model
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
@@ -100,6 +105,20 @@ def test_preemption(
|
||||
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
|
||||
assert hf_output_ids == vllm_output_ids, (
|
||||
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
|
||||
"is not enough KV cache space." in caplog_vllm.text)
|
||||
# Ensure the count bucket of request-level histogram metrics matches
|
||||
# the number of requests as a simple sanity check to ensure metrics are
|
||||
# generated
|
||||
preemption_metrics = None
|
||||
for m in REGISTRY.collect():
|
||||
if m.name == "vllm:num_preemptions":
|
||||
preemption_metrics = m
|
||||
assert preemption_metrics is not None
|
||||
total_recorded_preemption = 0
|
||||
for sample in preemption_metrics.samples:
|
||||
total_recorded_preemption += sample.value
|
||||
assert total_preemption == total_recorded_preemption
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@@ -107,6 +126,7 @@ def test_preemption(
|
||||
@pytest.mark.parametrize("max_tokens", [96])
|
||||
@pytest.mark.parametrize("beam_width", [4])
|
||||
def test_swap(
|
||||
caplog_vllm,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
@@ -122,11 +142,18 @@ def test_swap(
|
||||
max_tokens)
|
||||
del hf_model
|
||||
|
||||
vllm_model = vllm_runner(model, dtype=dtype, swap_space=10)
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
swap_space=10,
|
||||
disable_log_stats=False,
|
||||
)
|
||||
vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
|
||||
max_tokens)
|
||||
assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
|
||||
ARTIFICIAL_PREEMPTION_MAX_CNT)
|
||||
total_preemption = (
|
||||
vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
|
||||
del vllm_model
|
||||
|
||||
for i in range(len(example_prompts)):
|
||||
@@ -138,6 +165,21 @@ def test_swap(
|
||||
f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
|
||||
f"vLLM: {vllm_output_ids}")
|
||||
|
||||
assert ("is preempted by PreemptionMode.SWAP mode because there "
|
||||
"is not enough KV cache space." in caplog_vllm.text)
|
||||
# Ensure the count bucket of request-level histogram metrics matches
|
||||
# the number of requests as a simple sanity check to ensure metrics are
|
||||
# generated
|
||||
preemption_metrics = None
|
||||
for m in REGISTRY.collect():
|
||||
if m.name == "vllm:num_preemptions":
|
||||
preemption_metrics = m
|
||||
assert preemption_metrics is not None
|
||||
total_recorded_preemption = 0
|
||||
for sample in preemption_metrics.samples:
|
||||
total_recorded_preemption += sample.value
|
||||
assert total_preemption == total_recorded_preemption
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
|
||||
Reference in New Issue
Block a user