[Core][Observability] Add KV cache residency metrics (#27793)

Introduces three new Prometheus histograms for fine-grained observability of KV cache residency behavior:

vllm:kv_block_lifetime_seconds — total lifetime from allocation to free
vllm:kv_block_idle_before_evict_seconds — idle duration before eviction
vllm:kv_block_reuse_gap_seconds — time between consecutive reuses of the same block

These metrics help operators analyze KV cache efficiency, reuse patterns, and eviction timing beyond simple utilization rates.

The implementation uses monotonic timestamps for accuracy and 1% sampling to keep overhead minimal (~48 bytes per tracked block); it is fully thread-safe and incurs zero runtime cost when disabled.

Two new runtime flags are introduced:

--kv-cache-metrics — enable KV cache residency metrics
--kv-cache-metrics-sample — control the sampling ratio (default: 0.01)

Signed-off-by: Shivam <shivamprasad91@gmail.com>
This commit is contained in:
shivampr
2025-12-01 10:27:53 -08:00
committed by GitHub
parent ec7035c9d4
commit cabc77cc86
11 changed files with 534 additions and 13 deletions

View File

@@ -29,6 +29,7 @@ from vllm.v1.core.encoder_cache_manager import (
compute_encoder_budget,
)
from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
from vllm.v1.core.kv_cache_metrics import KVCacheMetricsCollector
from vllm.v1.core.sched.interface import SchedulerInterface
from vllm.v1.core.sched.output import (
CachedRequestData,
@@ -40,7 +41,10 @@ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_qu
from vllm.v1.core.sched.utils import check_stop, remove_all
from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.metrics.stats import PrefixCacheStats, SchedulerStats
from vllm.v1.metrics.stats import (
PrefixCacheStats,
SchedulerStats,
)
from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.spec_decode.metrics import SpecDecodingStats
@@ -69,6 +73,12 @@ class Scheduler(SchedulerInterface):
self.kv_events_config = vllm_config.kv_events_config
self.parallel_config = vllm_config.parallel_config
self.log_stats = log_stats
self.observability_config = vllm_config.observability_config
self.kv_metrics_collector: KVCacheMetricsCollector | None = None
if self.observability_config.kv_cache_metrics:
self.kv_metrics_collector = KVCacheMetricsCollector(
self.observability_config.kv_cache_metrics_sample,
)
self.structured_output_manager = structured_output_manager
self.is_encoder_decoder = vllm_config.model_config.is_encoder_decoder
@@ -187,6 +197,7 @@ class Scheduler(SchedulerInterface):
dcp_world_size=self.dcp_world_size,
pcp_world_size=self.pcp_world_size,
hash_block_size=self.block_size,
metrics_collector=self.kv_metrics_collector,
)
self.use_pp = self.parallel_config.pipeline_parallel_size > 1
self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
@@ -1356,14 +1367,24 @@ class Scheduler(SchedulerInterface):
prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
assert prefix_cache_stats is not None
connector_prefix_cache_stats = self._make_connector_prefix_cache_stats()
eviction_events = (
self.kv_metrics_collector.drain_events()
if self.kv_metrics_collector is not None
else []
)
spec_stats = spec_decoding_stats
connector_stats_payload = (
kv_connector_stats.data if kv_connector_stats else None
)
return SchedulerStats(
num_running_reqs=len(self.running),
num_waiting_reqs=len(self.waiting),
kv_cache_usage=self.kv_cache_manager.usage,
prefix_cache_stats=prefix_cache_stats,
connector_prefix_cache_stats=connector_prefix_cache_stats,
spec_decoding_stats=spec_decoding_stats,
kv_connector_stats=kv_connector_stats.data if kv_connector_stats else None,
kv_cache_eviction_events=eviction_events,
spec_decoding_stats=spec_stats,
kv_connector_stats=connector_stats_payload,
)
def make_spec_decoding_stats(