Add Production Metrics in Prometheus format (#1890)
vllm/engine/llm_engine.py
@@ -7,6 +7,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.metrics import record_metrics
 from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
@@ -591,8 +592,8 @@ class LLMEngine:
         else:
             self.num_generation_tokens.append((now, num_batched_tokens))
 
-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not should_log:
             return
 
         # Discard the old stats.
@@ -631,6 +632,16 @@ class LLMEngine:
         else:
             cpu_cache_usage = 0.0
 
+        record_metrics(
+            avg_prompt_throughput=avg_prompt_throughput,
+            avg_generation_throughput=avg_generation_throughput,
+            scheduler_running=len(self.scheduler.running),
+            scheduler_swapped=len(self.scheduler.swapped),
+            scheduler_waiting=len(self.scheduler.waiting),
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+        )
+
         logger.info("Avg prompt throughput: "
                     f"{avg_prompt_throughput:.1f} tokens/s, "
                     "Avg generation throughput: "
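The record_metrics helper imported above lives in the new vllm/engine/metrics.py added by this commit, which is not part of this excerpt. Below is a minimal sketch of what such a module could look like, assuming prometheus_client Gauges and the exact keyword arguments used in the call above; the metric names and docstrings are illustrative, not necessarily the ones the commit defines.

# Hypothetical sketch of a metrics module backing record_metrics(); the real
# vllm/engine/metrics.py introduced by this commit is not shown in this excerpt.
from prometheus_client import Gauge

# One gauge per quantity passed in from LLMEngine._log_system_stats().
gauge_avg_prompt_throughput = Gauge(
    "vllm_avg_prompt_throughput_toks_per_s",
    "Average prefill throughput in tokens/s.")
gauge_avg_generation_throughput = Gauge(
    "vllm_avg_generation_throughput_toks_per_s",
    "Average generation throughput in tokens/s.")
gauge_scheduler_running = Gauge(
    "vllm_requests_running",
    "Number of requests currently running on the GPU.")
gauge_scheduler_swapped = Gauge(
    "vllm_requests_swapped",
    "Number of requests swapped out to CPU memory.")
gauge_scheduler_waiting = Gauge(
    "vllm_requests_waiting",
    "Number of requests waiting to be scheduled.")
gauge_gpu_cache_usage = Gauge(
    "vllm_gpu_cache_usage_perc",
    "GPU KV-cache usage (1.0 means fully used).")
gauge_cpu_cache_usage = Gauge(
    "vllm_cpu_cache_usage_perc",
    "CPU KV-cache usage (1.0 means fully used).")


def record_metrics(
    avg_prompt_throughput: float,
    avg_generation_throughput: float,
    scheduler_running: int,
    scheduler_swapped: int,
    scheduler_waiting: int,
    gpu_cache_usage: float,
    cpu_cache_usage: float,
) -> None:
    # Push the latest snapshot of engine state into the Prometheus gauges.
    gauge_avg_prompt_throughput.set(avg_prompt_throughput)
    gauge_avg_generation_throughput.set(avg_generation_throughput)
    gauge_scheduler_running.set(scheduler_running)
    gauge_scheduler_swapped.set(scheduler_swapped)
    gauge_scheduler_waiting.set(scheduler_waiting)
    gauge_gpu_cache_usage.set(gpu_cache_usage)
    gauge_cpu_cache_usage.set(cpu_cache_usage)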
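For the gauges to be scraped, they still have to be served over HTTP in the Prometheus text format. The commit's server-side wiring is not shown in this excerpt; one common approach with prometheus_client, assuming the sketch above, is to mount the client's ASGI app on the FastAPI server that already serves the engine's API.

# Hypothetical wiring, not taken from the commit: expose the default
# prometheus_client registry (and thus the gauges sketched above) at /metrics.
from fastapi import FastAPI
from prometheus_client import make_asgi_app

app = FastAPI()
app.mount("/metrics", make_asgi_app())

# A Prometheus server can then scrape http://<host>:<port>/metrics to collect
# the engine gauges updated by each call to record_metrics().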