[Metrics] Add Prometheus counters for Model FLOPs Utilization (MFU) (#30950)

Export the existing Model FLOPs Utilization (MFU) metrics via Prometheus. The metrics are only exposed when `--enable-mfu-metrics` is set.

Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2026-02-23 15:01:07 +00:00
parent b95bb6927f
commit 5cc7c4452e
5 changed files with 109 additions and 1 deletions
--- a/docs/mkdocs/hooks/generate_metrics.py
+++ b/docs/mkdocs/hooks/generate_metrics.py
@@ -22,6 +22,7 @@ METRIC_SOURCE_FILES = [
        "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
        "output": "nixl_connector.inc.md",
    },
    {"path": "vllm/v1/metrics/perf.py", "output": "perf.inc.md"},
 ]
--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@@ -45,6 +45,12 @@ The following metrics are exposed:
 --8<-- "docs/generated/metrics/nixl_connector.inc.md"
 ## Model FLOPs Utilization (MFU) Performance Metrics
 These metrics are available via `--enable-mfu-metrics`:
 --8<-- "docs/generated/metrics/perf.inc.md"
 ## Deprecation Policy
 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
 from vllm.logger import init_logger
 from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
 from vllm.v1.engine import FinishReason
-from vllm.v1.metrics.perf import PerfMetricsLogging
+from vllm.v1.metrics.perf import PerfMetricsLogging, PerfMetricsProm
 from vllm.v1.metrics.prometheus import unregister_vllm_metrics
 from vllm.v1.metrics.stats import (
    CachingMetrics,
@@ -392,6 +392,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
    _histogram_cls = Histogram
    _spec_decoding_cls = SpecDecodingProm
    _kv_connector_cls = KVConnectorPrometheus
    _perf_metrics_cls = PerfMetricsProm
    def __init__(
        self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
@@ -424,6 +425,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
        self.kv_connector_prom = self._kv_connector_cls(
            vllm_config, labelnames, per_engine_labelvalues
        )
        self.perf_metrics_prom = self._perf_metrics_cls(
            vllm_config, labelnames, per_engine_labelvalues
        )
        #
        # Scheduler state
@@ -1065,6 +1069,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
                    scheduler_stats.kv_connector_stats, engine_idx
                )
            if scheduler_stats.perf_stats is not None:
                self.perf_metrics_prom.observe(scheduler_stats.perf_stats, engine_idx)
            if (
                self.kv_cache_metrics_enabled
                and scheduler_stats.kv_cache_eviction_events
--- a/vllm/v1/metrics/perf.py
+++ b/vllm/v1/metrics/perf.py
@@ -13,6 +13,7 @@ from collections.abc import Iterable
 from dataclasses import asdict, dataclass
 from typing import Any, Protocol
 import prometheus_client
 import torch
 from pydantic import BaseModel, Field, ValidationError, model_validator
 from typing_extensions import Self
@@ -1233,6 +1234,87 @@ class PerfMetricsLogging:
        self.reset()
 #### Prometheus Integration ####
 class PerfMetricsProm:
    """Export estimated per-GPU performance counters to Prometheus.

    Three counters are registered, each bound once per engine index:

      * vllm:estimated_flops_per_gpu_total
      * vllm:estimated_read_bytes_per_gpu_total
      * vllm:estimated_write_bytes_per_gpu_total

    Example PromQL for average TFLOPS (tera floating-point operations
    per second):

      rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12

    Example PromQL for average memory bandwidth in GB/s:

      (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) +
       rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9
    """

    # Metric class used to build the counters; kept as a class attribute so
    # that a subclass can substitute a different counter implementation.
    _counter_cls = prometheus_client.Counter

    def __init__(
        self,
        vllm_config: VllmConfig,
        labelnames: list[str],
        per_engine_labelvalues: dict[int, list[object]],
    ):
        """Register the counters and bind them to each engine's labels.

        Args:
            vllm_config: Accepted as part of the constructor signature; it
                is not read by this constructor.
            labelnames: Label names declared on every counter.
            per_engine_labelvalues: Engine index -> label values used to
                bind that engine's counter children.
        """
        # The metric definitions keep literal name=/documentation= keywords.
        flops_metric = self._counter_cls(
            name="vllm:estimated_flops_per_gpu_total",
            documentation=(
                "Estimated number of floating point operations per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_flops = make_per_engine(flops_metric, per_engine_labelvalues)

        read_bytes_metric = self._counter_cls(
            name="vllm:estimated_read_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes read from memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_read_bytes = make_per_engine(
            read_bytes_metric, per_engine_labelvalues
        )

        write_bytes_metric = self._counter_cls(
            name="vllm:estimated_write_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes written to memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_write_bytes = make_per_engine(
            write_bytes_metric, per_engine_labelvalues
        )

    def observe(self, perf_stats: PerfStats, engine_idx: int = 0):
        """Add one batch of estimates to the counters of ``engine_idx``.

        Does nothing when all three estimates in ``perf_stats`` are falsy
        (e.g. zero); otherwise all three counters are incremented, including
        those whose own estimate is zero.
        """
        flops = perf_stats.num_flops_per_gpu
        bytes_read = perf_stats.num_read_bytes_per_gpu
        bytes_written = perf_stats.num_write_bytes_per_gpu

        # Skip the update entirely when there is nothing to report.
        if not flops and not bytes_read and not bytes_written:
            return

        self.counter_flops[engine_idx].inc(flops)
        self.counter_read_bytes[engine_idx].inc(bytes_read)
        self.counter_write_bytes[engine_idx].inc(bytes_written)
 def make_per_engine(
    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]]
 ):
    """Bind ``counter`` to the label values of every engine.

    Returns a dict that maps each engine index found in
    ``per_engine_labelvalues`` to ``counter.labels(*labelvalues)`` for that
    engine's label values.
    """
    per_engine = {}
    for engine_idx, values in per_engine_labelvalues.items():
        per_engine[engine_idx] = counter.labels(*values)
    return per_engine
 ## util functions
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -4,6 +4,7 @@ import time
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus
 from vllm.v1.metrics.loggers import PrometheusStatLogger
 from vllm.v1.metrics.perf import PerfMetricsProm
 from vllm.v1.spec_decode.metrics import SpecDecodingProm
 try:
@@ -179,6 +180,16 @@ class RayKVConnectorPrometheus(KVConnectorPrometheus):
    _histogram_cls = RayHistogramWrapper
 class RayPerfMetricsProm(PerfMetricsProm):
    """
    RayPerfMetricsProm is used by RayMetrics to log Ray
    metrics. Provides the same MFU metrics as PerfMetricsProm
    but uses Ray's util.metrics library.
    """
    # Override the counter class inherited from PerfMetricsProm so the
    # counters are built with the Ray wrapper instead.
    _counter_cls = RayCounterWrapper
 class RayPrometheusStatLogger(PrometheusStatLogger):
    """RayPrometheusStatLogger uses Ray metrics instead."""
@@ -187,6 +198,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger):
    _histogram_cls = RayHistogramWrapper
    _spec_decoding_cls = RaySpecDecodingProm
    _kv_connector_cls = RayKVConnectorPrometheus
    _perf_metrics_cls = RayPerfMetricsProm
    @staticmethod
    def _unregister_vllm_metrics():