[Scheduler] Warning upon preemption and Swapping (#4647)

Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-05-13 23:50:44 +09:00
parent 350f9e107f
commit e7c46b9527
7 changed files with 108 additions and 3 deletions
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -6,6 +6,7 @@ Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
 pytest tests/basic_correctness/test_preemption.py`.
 """
 import pytest
+from prometheus_client import REGISTRY

 from vllm import SamplingParams
 from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
@@ -71,6 +72,7 @@ def test_chunked_prefill_recompute(
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
 def test_preemption(
+    caplog_vllm,
    hf_runner,
    vllm_runner,
    example_prompts,
@@ -87,10 +89,13 @@ def test_preemption(
    vllm_model = vllm_runner(
        model,
        dtype=dtype,
+        disable_log_stats=False,
    )
    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    total_preemption = (
+        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
    del vllm_model

    for i in range(len(example_prompts)):
@@ -100,6 +105,20 @@ def test_preemption(
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
+            "is not enough KV cache space." in caplog_vllm.text)
+    # Ensure the count bucket of request-level histogram metrics matches
+    # the number of requests as a simple sanity check to ensure metrics are
+    # generated
+    preemption_metrics = None
+    for m in REGISTRY.collect():
+        if m.name == "vllm:num_preemptions":
+            preemption_metrics = m
+    assert preemption_metrics is not None
+    total_recorded_preemption = 0
+    for sample in preemption_metrics.samples:
+        total_recorded_preemption += sample.value
+    assert total_preemption == total_recorded_preemption


@pytest.mark.parametrize("model", MODELS)
@@ -107,6 +126,7 @@ def test_preemption(
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4])
 def test_swap(
+    caplog_vllm,
    hf_runner,
    vllm_runner,
    example_prompts,
@@ -122,11 +142,18 @@ def test_swap(
                                               max_tokens)
    del hf_model

-    vllm_model = vllm_runner(model, dtype=dtype, swap_space=10)
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        swap_space=10,
+        disable_log_stats=False,
+    )
    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
                                                   max_tokens)
    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    total_preemption = (
+        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
    del vllm_model

    for i in range(len(example_prompts)):
@@ -138,6 +165,21 @@ def test_swap(
                f"Test{i} output{j}:\nHF: {hf_output_ids}\n"
                f"vLLM: {vllm_output_ids}")

+    assert ("is preempted by PreemptionMode.SWAP mode because there "
+            "is not enough KV cache space." in caplog_vllm.text)
+    # Ensure the count bucket of request-level histogram metrics matches
+    # the number of requests as a simple sanity check to ensure metrics are
+    # generated
+    preemption_metrics = None
+    for m in REGISTRY.collect():
+        if m.name == "vllm:num_preemptions":
+            preemption_metrics = m
+    assert preemption_metrics is not None
+    total_recorded_preemption = 0
+    for sample in preemption_metrics.samples:
+        total_recorded_preemption += sample.value
+    assert total_preemption == total_recorded_preemption
+

@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])