[Scheduler] Warning upon preemption and swapping (#4647)

Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
2024-05-13 23:50:44 +09:00
parent 350f9e107f
commit e7c46b9527
7 changed files with 108 additions and 3 deletions
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -61,6 +61,10 @@ class Metrics:
            labelnames=labelnames)

        # Iteration stats
+        self.counter_num_preemption = Counter(
+            name="vllm:num_preemptions_total",
+            documentation="Cumulative number of preemption from the engine.",
+            labelnames=labelnames)
        self.counter_prompt_tokens = Counter(
            name="vllm:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
@@ -181,6 +185,7 @@ class Stats:
    num_generation_tokens_iter: int
    time_to_first_tokens_iter: List[float]
    time_per_output_tokens_iter: List[float]
+    num_preemption_iter: int

    # Request stats (should have _requests suffix)
    #   Latency
@@ -244,6 +249,8 @@ class StatLogger:
                        stats.cpu_cache_usage_sys)

        # Iteration level data
+        self._log_counter(self.metrics.counter_num_preemption,
+                          stats.num_preemption_iter)
        self._log_counter(self.metrics.counter_prompt_tokens,
                          stats.num_prompt_tokens_iter)
        self._log_counter(self.metrics.counter_generation_tokens,
@@ -336,7 +343,7 @@ class StatLogger:
                "Avg generation throughput: %.1f tokens/s, "
                "Running: %d reqs, Swapped: %d reqs, "
                "Pending: %d reqs, GPU KV cache usage: %.1f%%, "
-                "CPU KV cache usage: %.1f%%",
+                "CPU KV cache usage: %.1f%%.",
                prompt_throughput,
                generation_throughput,
                stats.num_running_sys,