[Metrics] Model FLOPs Utilization estimation (#30738)

Signed-off-by: SungMinCho <tjdals4565@gmail.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
SungMinCho
2025-12-17 17:40:51 -08:00
committed by GitHub
parent ed2897f336
commit a0b782f9cc
8 changed files with 2186 additions and 2 deletions

View File

@@ -43,6 +43,7 @@ from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_qu
from vllm.v1.core.sched.utils import check_stop, remove_all
from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.metrics.perf import ModelMetrics, PerfStats
from vllm.v1.metrics.stats import (
PrefixCacheStats,
SchedulerStats,
@@ -219,6 +220,10 @@ class Scheduler(SchedulerInterface):
self.use_pp = self.parallel_config.pipeline_parallel_size > 1
self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
self.perf_metrics: ModelMetrics | None = None
if self.log_stats and vllm_config.observability_config.enable_mfu_metrics:
self.perf_metrics = ModelMetrics(vllm_config)
def schedule(self) -> SchedulerOutput:
# NOTE(woosuk) on the scheduling algorithm:
# There's no "decoding phase" nor "prefill phase" in the scheduler.
@@ -1066,6 +1071,10 @@ class Scheduler(SchedulerInterface):
kv_connector_output = model_runner_output.kv_connector_output
cudagraph_stats = model_runner_output.cudagraph_stats
perf_stats: PerfStats | None = None
if self.perf_metrics and self.perf_metrics.is_enabled():
perf_stats = self.perf_metrics.get_step_perf_stats_per_gpu(scheduler_output)
outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list)
spec_decoding_stats: SpecDecodingStats | None = None
kv_connector_stats: KVConnectorStats | None = (
@@ -1262,7 +1271,7 @@ class Scheduler(SchedulerInterface):
if (
stats := self.make_stats(
spec_decoding_stats, kv_connector_stats, cudagraph_stats
spec_decoding_stats, kv_connector_stats, cudagraph_stats, perf_stats
)
) is not None:
# Return stats to only one of the front-ends.
@@ -1485,6 +1494,7 @@ class Scheduler(SchedulerInterface):
spec_decoding_stats: SpecDecodingStats | None = None,
kv_connector_stats: KVConnectorStats | None = None,
cudagraph_stats: CUDAGraphStat | None = None,
perf_stats: PerfStats | None = None,
) -> SchedulerStats | None:
if not self.log_stats:
return None
@@ -1510,6 +1520,7 @@ class Scheduler(SchedulerInterface):
spec_decoding_stats=spec_stats,
kv_connector_stats=connector_stats_payload,
cudagraph_stats=cudagraph_stats,
perf_stats=perf_stats,
)
def make_spec_decoding_stats(