[V1][Core][1/n] Logging and Metrics (#11962)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
This commit is contained in:
Robert Shaw
2025-01-12 16:02:02 -05:00
committed by GitHub
parent 263a870ee1
commit 9597a095f2
11 changed files with 129 additions and 84 deletions

View File

@@ -4,7 +4,6 @@ from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
from vllm.config import ModelConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.metrics_types import StatLoggerBase
from vllm.engine.protocol import EngineClient
from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
from vllm.inputs.preprocess import InputPreprocessor
@@ -22,6 +21,8 @@ from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.detokenizer import Detokenizer
from vllm.v1.engine.processor import Processor
from vllm.v1.executor.abstract import Executor
from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase
from vllm.v1.metrics.stats import SchedulerStats
logger = init_logger(__name__)
@@ -34,7 +35,6 @@ class AsyncLLM(EngineClient):
executor_class: Type[Executor],
log_stats: bool,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
input_registry: InputRegistry = INPUT_REGISTRY,
use_cached_outputs: bool = False,
log_requests: bool = True,
@@ -45,7 +45,10 @@ class AsyncLLM(EngineClient):
self.log_requests = log_requests
self.log_stats = log_stats
self.stat_loggers = stat_loggers
self.stat_loggers: List[StatLoggerBase] = [
LoggingStatLogger(),
# TODO(rob): PrometheusStatLogger(),
]
self.model_config = vllm_config.model_config
# Tokenizer (+ ensure liveness if running in another process).
@@ -82,7 +85,6 @@ class AsyncLLM(EngineClient):
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=self.log_stats,
)
self.output_handler: Optional[asyncio.Task] = None
@@ -94,7 +96,6 @@ class AsyncLLM(EngineClient):
engine_config: Optional[VllmConfig] = None,
start_engine_loop: bool = True,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
) -> "AsyncLLM":
"""Create an AsyncLLM from the EngineArgs."""
@@ -114,7 +115,6 @@ class AsyncLLM(EngineClient):
log_stats=not engine_args.disable_log_stats,
start_engine_loop=start_engine_loop,
usage_context=usage_context,
stat_loggers=stat_loggers,
)
def shutdown(self):
@@ -254,7 +254,8 @@ class AsyncLLM(EngineClient):
outputs = await self.engine_core.get_output_async()
# 2) Detokenize based on the output.
request_outputs, reqs_to_abort = self.detokenizer.step(outputs)
request_outputs, reqs_to_abort = self.detokenizer.step(
outputs.outputs)
# 3) Put the RequestOutputs into the per-request queues.
self._process_request_outputs(request_outputs)
@@ -262,6 +263,9 @@ class AsyncLLM(EngineClient):
# 4) Abort any requests that finished due to stop strings.
await self.engine_core.abort_requests_async(reqs_to_abort)
# 5) Log any stats.
await self._log_stats(scheduler_stats=outputs.scheduler_stats)
except Exception as e:
logger.exception("EngineCore output handler hit an error: %s", e)
kill_process_tree(os.getpid())
@@ -278,6 +282,14 @@ class AsyncLLM(EngineClient):
if request_id in self.rid_to_queue:
del self.rid_to_queue[request_id]
async def _log_stats(self, scheduler_stats: SchedulerStats):
"""Log stats to the stat loggers."""
if not self.log_stats:
return
for logger in self.stat_loggers:
logger.log(scheduler_stats=scheduler_stats)
def encode(
self,
prompt: PromptType,