[Metrics] Add Prometheus counters for Model FLOPs Utilization (MFU) (#30950)
Export the existing Model FLOPs Utilization (MFU) metrics via Prometheus. `--enable-mfu-metrics` is required for these to be exposed. Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
@@ -22,6 +22,7 @@ METRIC_SOURCE_FILES = [
|
|||||||
"path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
|
"path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
|
||||||
"output": "nixl_connector.inc.md",
|
"output": "nixl_connector.inc.md",
|
||||||
},
|
},
|
||||||
|
{"path": "vllm/v1/metrics/perf.py", "output": "perf.inc.md"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -45,6 +45,12 @@ The following metrics are exposed:
|
|||||||
|
|
||||||
--8<-- "docs/generated/metrics/nixl_connector.inc.md"
|
--8<-- "docs/generated/metrics/nixl_connector.inc.md"
|
||||||
|
|
||||||
|
## Model FLOPs Utilization (MFU) Performance Metrics
|
||||||
|
|
||||||
|
These metrics are available via `--enable-mfu-metrics`:
|
||||||
|
|
||||||
|
--8<-- "docs/generated/metrics/perf.inc.md"
|
||||||
|
|
||||||
## Deprecation Policy
|
## Deprecation Policy
|
||||||
|
|
||||||
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
|
from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
|
||||||
from vllm.v1.engine import FinishReason
|
from vllm.v1.engine import FinishReason
|
||||||
from vllm.v1.metrics.perf import PerfMetricsLogging
|
from vllm.v1.metrics.perf import PerfMetricsLogging, PerfMetricsProm
|
||||||
from vllm.v1.metrics.prometheus import unregister_vllm_metrics
|
from vllm.v1.metrics.prometheus import unregister_vllm_metrics
|
||||||
from vllm.v1.metrics.stats import (
|
from vllm.v1.metrics.stats import (
|
||||||
CachingMetrics,
|
CachingMetrics,
|
||||||
@@ -392,6 +392,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
_histogram_cls = Histogram
|
_histogram_cls = Histogram
|
||||||
_spec_decoding_cls = SpecDecodingProm
|
_spec_decoding_cls = SpecDecodingProm
|
||||||
_kv_connector_cls = KVConnectorPrometheus
|
_kv_connector_cls = KVConnectorPrometheus
|
||||||
|
_perf_metrics_cls = PerfMetricsProm
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
|
self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
|
||||||
@@ -424,6 +425,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
self.kv_connector_prom = self._kv_connector_cls(
|
self.kv_connector_prom = self._kv_connector_cls(
|
||||||
vllm_config, labelnames, per_engine_labelvalues
|
vllm_config, labelnames, per_engine_labelvalues
|
||||||
)
|
)
|
||||||
|
self.perf_metrics_prom = self._perf_metrics_cls(
|
||||||
|
vllm_config, labelnames, per_engine_labelvalues
|
||||||
|
)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Scheduler state
|
# Scheduler state
|
||||||
@@ -1065,6 +1069,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
scheduler_stats.kv_connector_stats, engine_idx
|
scheduler_stats.kv_connector_stats, engine_idx
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if scheduler_stats.perf_stats is not None:
|
||||||
|
self.perf_metrics_prom.observe(scheduler_stats.perf_stats, engine_idx)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self.kv_cache_metrics_enabled
|
self.kv_cache_metrics_enabled
|
||||||
and scheduler_stats.kv_cache_eviction_events
|
and scheduler_stats.kv_cache_eviction_events
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ from collections.abc import Iterable
|
|||||||
from dataclasses import asdict, dataclass
|
from dataclasses import asdict, dataclass
|
||||||
from typing import Any, Protocol
|
from typing import Any, Protocol
|
||||||
|
|
||||||
|
import prometheus_client
|
||||||
import torch
|
import torch
|
||||||
from pydantic import BaseModel, Field, ValidationError, model_validator
|
from pydantic import BaseModel, Field, ValidationError, model_validator
|
||||||
from typing_extensions import Self
|
from typing_extensions import Self
|
||||||
@@ -1233,6 +1234,87 @@ class PerfMetricsLogging:
|
|||||||
self.reset()
|
self.reset()
|
||||||
|
|
||||||
|
|
||||||
|
#### Prometheus Integration ####
|
||||||
|
|
||||||
|
|
||||||
|
class PerfMetricsProm:
    """Export per-GPU performance estimates as Prometheus counters.

    Three counters are registered: estimated floating-point operations,
    estimated bytes read and estimated bytes written. Each one is bound
    to every engine's label values so it can be incremented per engine.

    Average TFLOPS (tera floating-point operations per second) can be
    derived with a PromQL query such as:

      rate(vllm:estimated_flops_per_gpu_total[1m]) / 1e12

    Average memory bandwidth in GB/s can be derived with:

      (rate(vllm:estimated_read_bytes_per_gpu_total[1m]) +
       rate(vllm:estimated_write_bytes_per_gpu_total[1m])) / 1e9
    """

    # Counter implementation; subclasses may override this with a
    # compatible wrapper class.
    _counter_cls = prometheus_client.Counter

    def __init__(
        self,
        vllm_config: VllmConfig,
        labelnames: list[str],
        per_engine_labelvalues: dict[int, list[object]],
    ):
        """Register the counters and bind them per engine.

        Args:
            vllm_config: Accepted for signature parity with the caller;
                this constructor does not read it.
            labelnames: Label names declared on every counter.
            per_engine_labelvalues: Maps an engine index to the label
                values used for that engine's counter child.
        """
        # NOTE: the metric definitions are kept as explicit literal calls
        # rather than being folded into a loop or helper.
        flops_metric = self._counter_cls(
            name="vllm:estimated_flops_per_gpu_total",
            documentation=(
                "Estimated number of floating point operations per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_flops = make_per_engine(flops_metric, per_engine_labelvalues)

        read_bytes_metric = self._counter_cls(
            name="vllm:estimated_read_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes read from memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_read_bytes = make_per_engine(
            read_bytes_metric, per_engine_labelvalues
        )

        write_bytes_metric = self._counter_cls(
            name="vllm:estimated_write_bytes_per_gpu_total",
            documentation=(
                "Estimated number of bytes written to memory per GPU "
                "(for Model Flops Utilization calculations)."
            ),
            labelnames=labelnames,
        )
        self.counter_write_bytes = make_per_engine(
            write_bytes_metric, per_engine_labelvalues
        )

    def observe(self, perf_stats: PerfStats, engine_idx: int = 0):
        """Add one batch of estimates to the counters of ``engine_idx``.

        Nothing is recorded when all three estimates are falsy (zero).

        Args:
            perf_stats: Provides ``num_flops_per_gpu``,
                ``num_read_bytes_per_gpu`` and ``num_write_bytes_per_gpu``.
            engine_idx: Key selecting which engine's counters to increment.
        """
        flops = perf_stats.num_flops_per_gpu
        read_bytes = perf_stats.num_read_bytes_per_gpu
        write_bytes = perf_stats.num_write_bytes_per_gpu

        # Skip the update entirely when there is nothing to add.
        if not flops and not read_bytes and not write_bytes:
            return

        self.counter_flops[engine_idx].inc(flops)
        self.counter_read_bytes[engine_idx].inc(read_bytes)
        self.counter_write_bytes[engine_idx].inc(write_bytes)
|
||||||
|
|
||||||
|
|
||||||
|
def make_per_engine(
    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]]
):
    """Bind ``counter`` to each engine's label values.

    Args:
        counter: Labelled counter whose ``labels(...)`` method is called
            once per engine.
        per_engine_labelvalues: Maps an engine index to the positional
            label values for that engine.

    Returns:
        A dict with the same keys as ``per_engine_labelvalues``, each
        mapped to the result of ``counter.labels(*labelvalues)``.
    """
    bound_by_engine = {}
    for engine_idx, values in per_engine_labelvalues.items():
        bound_by_engine[engine_idx] = counter.labels(*values)
    return bound_by_engine
|
||||||
|
|
||||||
|
|
||||||
## util functions
|
## util functions
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import time
|
|||||||
|
|
||||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus
|
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus
|
||||||
from vllm.v1.metrics.loggers import PrometheusStatLogger
|
from vllm.v1.metrics.loggers import PrometheusStatLogger
|
||||||
|
from vllm.v1.metrics.perf import PerfMetricsProm
|
||||||
from vllm.v1.spec_decode.metrics import SpecDecodingProm
|
from vllm.v1.spec_decode.metrics import SpecDecodingProm
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -179,6 +180,16 @@ class RayKVConnectorPrometheus(KVConnectorPrometheus):
|
|||||||
_histogram_cls = RayHistogramWrapper
|
_histogram_cls = RayHistogramWrapper
|
||||||
|
|
||||||
|
|
||||||
|
class RayPerfMetricsProm(PerfMetricsProm):
    """
    RayPerfMetricsProm is used by RayMetrics to log Ray
    metrics. Provides the same MFU metrics as PerfMetricsProm
    but uses Ray's util.metrics library.
    """

    # Override the counter class so the inherited constructor builds its
    # counters through the Ray wrapper instead of the parent's default.
    _counter_cls = RayCounterWrapper
|
||||||
|
|
||||||
|
|
||||||
class RayPrometheusStatLogger(PrometheusStatLogger):
|
class RayPrometheusStatLogger(PrometheusStatLogger):
|
||||||
"""RayPrometheusStatLogger uses Ray metrics instead."""
|
"""RayPrometheusStatLogger uses Ray metrics instead."""
|
||||||
|
|
||||||
@@ -187,6 +198,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger):
|
|||||||
_histogram_cls = RayHistogramWrapper
|
_histogram_cls = RayHistogramWrapper
|
||||||
_spec_decoding_cls = RaySpecDecodingProm
|
_spec_decoding_cls = RaySpecDecodingProm
|
||||||
_kv_connector_cls = RayKVConnectorPrometheus
|
_kv_connector_cls = RayKVConnectorPrometheus
|
||||||
|
_perf_metrics_cls = RayPerfMetricsProm
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _unregister_vllm_metrics():
|
def _unregister_vllm_metrics():
|
||||||
|
|||||||
Reference in New Issue
Block a user