[Core][Observability] Add KV cache residency metrics (#27793)
Introduces three new Prometheus histograms for fine-grained observability of KV cache residency behavior:

- vllm:kv_block_lifetime_seconds — total lifetime of a block, from allocation to free
- vllm:kv_block_idle_before_evict_seconds — idle duration before eviction
- vllm:kv_block_reuse_gap_seconds — time between consecutive reuses of the same block

These metrics help operators analyze KV cache efficiency, reuse patterns, and eviction timing beyond simple utilization rates.

The implementation uses monotonic timestamps for accuracy and 1% sampling for minimal overhead (~48 bytes/block); it is fully thread-safe and has zero runtime cost when disabled.

Two new runtime flags are introduced:

- --kv-cache-metrics — enable KV cache residency metrics
- --kv-cache-metrics-sample — control the sampling ratio (default: 0.01)

Signed-off-by: Shivam <shivamprasad91@gmail.com>
This commit is contained in:
@@ -29,6 +29,7 @@ from vllm.v1.core.encoder_cache_manager import (
|
||||
compute_encoder_budget,
|
||||
)
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager
|
||||
from vllm.v1.core.kv_cache_metrics import KVCacheMetricsCollector
|
||||
from vllm.v1.core.sched.interface import SchedulerInterface
|
||||
from vllm.v1.core.sched.output import (
|
||||
CachedRequestData,
|
||||
@@ -40,7 +41,10 @@ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_qu
|
||||
from vllm.v1.core.sched.utils import check_stop, remove_all
|
||||
from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig
|
||||
from vllm.v1.metrics.stats import PrefixCacheStats, SchedulerStats
|
||||
from vllm.v1.metrics.stats import (
|
||||
PrefixCacheStats,
|
||||
SchedulerStats,
|
||||
)
|
||||
from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.spec_decode.metrics import SpecDecodingStats
|
||||
@@ -69,6 +73,12 @@ class Scheduler(SchedulerInterface):
|
||||
self.kv_events_config = vllm_config.kv_events_config
|
||||
self.parallel_config = vllm_config.parallel_config
|
||||
self.log_stats = log_stats
|
||||
self.observability_config = vllm_config.observability_config
|
||||
self.kv_metrics_collector: KVCacheMetricsCollector | None = None
|
||||
if self.observability_config.kv_cache_metrics:
|
||||
self.kv_metrics_collector = KVCacheMetricsCollector(
|
||||
self.observability_config.kv_cache_metrics_sample,
|
||||
)
|
||||
self.structured_output_manager = structured_output_manager
|
||||
self.is_encoder_decoder = vllm_config.model_config.is_encoder_decoder
|
||||
|
||||
@@ -187,6 +197,7 @@ class Scheduler(SchedulerInterface):
|
||||
dcp_world_size=self.dcp_world_size,
|
||||
pcp_world_size=self.pcp_world_size,
|
||||
hash_block_size=self.block_size,
|
||||
metrics_collector=self.kv_metrics_collector,
|
||||
)
|
||||
self.use_pp = self.parallel_config.pipeline_parallel_size > 1
|
||||
self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
|
||||
@@ -1356,14 +1367,24 @@ class Scheduler(SchedulerInterface):
|
||||
prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
|
||||
assert prefix_cache_stats is not None
|
||||
connector_prefix_cache_stats = self._make_connector_prefix_cache_stats()
|
||||
eviction_events = (
|
||||
self.kv_metrics_collector.drain_events()
|
||||
if self.kv_metrics_collector is not None
|
||||
else []
|
||||
)
|
||||
spec_stats = spec_decoding_stats
|
||||
connector_stats_payload = (
|
||||
kv_connector_stats.data if kv_connector_stats else None
|
||||
)
|
||||
return SchedulerStats(
|
||||
num_running_reqs=len(self.running),
|
||||
num_waiting_reqs=len(self.waiting),
|
||||
kv_cache_usage=self.kv_cache_manager.usage,
|
||||
prefix_cache_stats=prefix_cache_stats,
|
||||
connector_prefix_cache_stats=connector_prefix_cache_stats,
|
||||
spec_decoding_stats=spec_decoding_stats,
|
||||
kv_connector_stats=kv_connector_stats.data if kv_connector_stats else None,
|
||||
kv_cache_eviction_events=eviction_events,
|
||||
spec_decoding_stats=spec_stats,
|
||||
kv_connector_stats=connector_stats_payload,
|
||||
)
|
||||
|
||||
def make_spec_decoding_stats(
|
||||
|
||||
Reference in New Issue
Block a user