From 5cc7c4452e48b4492c47ff7e130751d7a786dbf9 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 23 Feb 2026 15:01:07 +0000 Subject: [PATCH] [Metrics] Add Prometheus counters for Model FLOPs Utilization (MFU) (#30950) Export the existing Model FLOPs Utilization (MFU) metrics via Prometheus. `--enable-mfu-metrics` is required for these to be exposed. Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Mark McLoughlin --- docs/mkdocs/hooks/generate_metrics.py | 1 + docs/usage/metrics.md | 6 ++ vllm/v1/metrics/loggers.py | 9 ++- vllm/v1/metrics/perf.py | 82 +++++++++++++++++++++++++++ vllm/v1/metrics/ray_wrappers.py | 12 ++++ 5 files changed, 109 insertions(+), 1 deletion(-) diff --git a/docs/mkdocs/hooks/generate_metrics.py b/docs/mkdocs/hooks/generate_metrics.py index 9cbf63599..4565861c4 100644 --- a/docs/mkdocs/hooks/generate_metrics.py +++ b/docs/mkdocs/hooks/generate_metrics.py @@ -22,6 +22,7 @@ METRIC_SOURCE_FILES = [ "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py", "output": "nixl_connector.inc.md", }, + {"path": "vllm/v1/metrics/perf.py", "output": "perf.inc.md"}, ] diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md index 421d5df4a..44c9c7cbf 100644 --- a/docs/usage/metrics.md +++ b/docs/usage/metrics.md @@ -45,6 +45,12 @@ The following metrics are exposed: --8<-- "docs/generated/metrics/nixl_connector.inc.md" +## Model Flops Utilization (MFU) Performance Metrics + +These metrics are available via `--enable-mfu-metrics`: + +--8<-- "docs/generated/metrics/perf.inc.md" + ## Deprecation Policy Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 229b5742d..f20d78542 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( from vllm.logger import init_logger from vllm.plugins import 
STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group from vllm.v1.engine import FinishReason -from vllm.v1.metrics.perf import PerfMetricsLogging +from vllm.v1.metrics.perf import PerfMetricsLogging, PerfMetricsProm from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.stats import ( CachingMetrics, @@ -392,6 +392,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase): _histogram_cls = Histogram _spec_decoding_cls = SpecDecodingProm _kv_connector_cls = KVConnectorPrometheus + _perf_metrics_cls = PerfMetricsProm def __init__( self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None @@ -424,6 +425,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.kv_connector_prom = self._kv_connector_cls( vllm_config, labelnames, per_engine_labelvalues ) + self.perf_metrics_prom = self._perf_metrics_cls( + vllm_config, labelnames, per_engine_labelvalues + ) # # Scheduler state @@ -1065,6 +1069,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase): scheduler_stats.kv_connector_stats, engine_idx ) + if scheduler_stats.perf_stats is not None: + self.perf_metrics_prom.observe(scheduler_stats.perf_stats, engine_idx) + if ( self.kv_cache_metrics_enabled and scheduler_stats.kv_cache_eviction_events diff --git a/vllm/v1/metrics/perf.py b/vllm/v1/metrics/perf.py index 2b2d44069..8b4c419ae 100644 --- a/vllm/v1/metrics/perf.py +++ b/vllm/v1/metrics/perf.py @@ -13,6 +13,7 @@ from collections.abc import Iterable from dataclasses import asdict, dataclass from typing import Any, Protocol +import prometheus_client import torch from pydantic import BaseModel, Field, ValidationError, model_validator from typing_extensions import Self @@ -1233,6 +1234,87 @@ class PerfMetricsLogging: self.reset() +#### Prometheus Integration #### + + +class PerfMetricsProm: + """Record performance metrics in Prometheus. 
+ + Average TFLOPS (tera floating-point operations per second) can be + calculated using a PromQL query: + + rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12 + + Average memory bandwidth in GB/s can be calculated using: + + (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) + + rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9 + """ + + _counter_cls = prometheus_client.Counter + + def __init__( + self, + vllm_config: VllmConfig, + labelnames: list[str], + per_engine_labelvalues: dict[int, list[object]], + ): + counter_flops = self._counter_cls( + name="vllm:estimated_flops_per_gpu_total", + documentation=( + "Estimated number of floating point operations per GPU " + "(for Model Flops Utilization calculations)." + ), + labelnames=labelnames, + ) + self.counter_flops = make_per_engine(counter_flops, per_engine_labelvalues) + + counter_read_bytes = self._counter_cls( + name="vllm:estimated_read_bytes_per_gpu_total", + documentation=( + "Estimated number of bytes read from memory per GPU " + "(for Model Flops Utilization calculations)." + ), + labelnames=labelnames, + ) + self.counter_read_bytes = make_per_engine( + counter_read_bytes, per_engine_labelvalues + ) + + counter_write_bytes = self._counter_cls( + name="vllm:estimated_write_bytes_per_gpu_total", + documentation=( + "Estimated number of bytes written to memory per GPU " + "(for Model Flops Utilization calculations)." 
+ ), + labelnames=labelnames, + ) + self.counter_write_bytes = make_per_engine( + counter_write_bytes, per_engine_labelvalues + ) + + def observe(self, perf_stats: PerfStats, engine_idx: int = 0): + if not ( + perf_stats.num_flops_per_gpu + or perf_stats.num_read_bytes_per_gpu + or perf_stats.num_write_bytes_per_gpu + ): + return + self.counter_flops[engine_idx].inc(perf_stats.num_flops_per_gpu) + self.counter_read_bytes[engine_idx].inc(perf_stats.num_read_bytes_per_gpu) + self.counter_write_bytes[engine_idx].inc(perf_stats.num_write_bytes_per_gpu) + + +def make_per_engine( + counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]] +): + """Return a dict mapping each engine index to its labelled counter.""" + return { + idx: counter.labels(*labelvalues) + for idx, labelvalues in per_engine_labelvalues.items() + } + + ## util functions diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index 4b46669d5..abc53f380 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -4,6 +4,7 @@ import time from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus from vllm.v1.metrics.loggers import PrometheusStatLogger +from vllm.v1.metrics.perf import PerfMetricsProm from vllm.v1.spec_decode.metrics import SpecDecodingProm try: @@ -179,6 +180,16 @@ class RayKVConnectorPrometheus(KVConnectorPrometheus): _histogram_cls = RayHistogramWrapper +class RayPerfMetricsProm(PerfMetricsProm): + """ + RayPerfMetricsProm is used by RayMetrics to log Ray + metrics. Provides the same MFU metrics as PerfMetricsProm but + uses Ray's util.metrics library. 
+ """ + + _counter_cls = RayCounterWrapper + + class RayPrometheusStatLogger(PrometheusStatLogger): """RayPrometheusStatLogger uses Ray metrics instead.""" @@ -187,6 +198,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger): _histogram_cls = RayHistogramWrapper _spec_decoding_cls = RaySpecDecodingProm _kv_connector_cls = RayKVConnectorPrometheus + _perf_metrics_cls = RayPerfMetricsProm @staticmethod def _unregister_vllm_metrics():