[Metrics] add more metrics (#4464)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Robert Shaw <rshaw@neuralmagic.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: harrywu
Date: 2024-11-12 00:17:38 +08:00 (committed by GitHub)
Parent: 2cebda42bb
Commit: 874f551b36
4 changed files with 437 additions and 50 deletions

@@ -1672,6 +1672,7 @@ class LLMEngine:
         # Iteration stats
         num_prompt_tokens_iter = 0
         num_generation_tokens_iter = 0
+        num_tokens_iter = 0
         time_to_first_tokens_iter: List[float] = []
         time_per_output_tokens_iter: List[float] = []
         num_preemption_iter = (0 if scheduler_outputs is None else
@@ -1680,6 +1681,10 @@
         # Request stats
         # Latency
         time_e2e_requests: List[float] = []
+        time_queue_requests: List[float] = []
+        time_inference_requests: List[float] = []
+        time_prefill_requests: List[float] = []
+        time_decode_requests: List[float] = []
         time_in_queue_requests: List[float] = []
         model_forward_time_requests: List[float] = []
         model_execute_time_requests: List[float] = []
@@ -1687,6 +1692,7 @@
         num_prompt_tokens_requests: List[int] = []
         num_generation_tokens_requests: List[int] = []
         n_requests: List[int] = []
+        max_num_generation_tokens_requests: List[int] = []
         max_tokens_requests: List[int] = []
         finished_reason_requests: List[str] = []
@@ -1777,6 +1783,18 @@
                     # Latency timings
                     time_e2e_requests.append(now -
                                              seq_group.metrics.arrival_time)
+                    if (seq_group.metrics.first_scheduled_time is not None and
+                            seq_group.metrics.first_token_time is not None):
+                        time_queue_requests.append(
+                            seq_group.metrics.first_scheduled_time -
+                            seq_group.metrics.arrival_time)
+                        time_prefill_requests.append(
+                            seq_group.metrics.first_token_time -
+                            seq_group.metrics.first_scheduled_time)
+                        time_decode_requests.append(
+                            now - seq_group.metrics.first_token_time)
+                        time_inference_requests.append(
+                            now - seq_group.metrics.first_scheduled_time)
                     if seq_group.metrics.time_in_queue is not None:
                         time_in_queue_requests.append(
                             seq_group.metrics.time_in_queue)
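
Note: the block above splits a finished request's lifetime into non-overlapping phases using the timestamps on seq_group.metrics. A minimal standalone sketch (illustrative numbers, not vLLM code) of how the phases compose:

# Stand-in timestamps mirroring seq_group.metrics; all values are
# illustrative time.time() floats.
arrival_time = 100.00          # request received by the engine
first_scheduled_time = 100.25  # scheduler first picked the request up
first_token_time = 100.75      # first output token emitted
now = 103.00                   # request just finished

time_queue = first_scheduled_time - arrival_time        # 0.25s waiting
time_prefill = first_token_time - first_scheduled_time  # 0.50s to first token
time_decode = now - first_token_time                    # 2.25s generating
time_inference = now - first_scheduled_time             # 2.75s scheduled-to-done
time_e2e = now - arrival_time                           # 3.00s total

# The phases compose: queue + inference == e2e, prefill + decode == inference.
assert abs(time_queue + time_inference - time_e2e) < 1e-9
assert abs(time_prefill + time_decode - time_inference) < 1e-9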
@@ -1793,6 +1811,9 @@
                         seq.get_output_len()
                         for seq in seq_group.get_finished_seqs()
                     ])
+                    max_num_generation_tokens_requests.append(
+                        max(seq.get_output_len()
+                            for seq in seq_group.get_seqs()))
                     if seq_group.sampling_params is not None:
                         n_requests.append(seq_group.sampling_params.n)
                         max_tokens_requests.append(
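
Note: with parallel sampling (n > 1) a sequence group holds several sequences, so the new metric records one value per request: the length of its longest output. A toy illustration with stand-in data:

# Hypothetical seq.get_output_len() values for one request with n=3.
output_lens = [12, 87, 35]

# num_generation_tokens_requests receives every length; the new
# max_num_generation_tokens metric receives only the maximum per request.
max_num_generation_tokens = max(output_lens)  # 87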
@@ -1811,7 +1832,8 @@
             num_generation_tokens_iter = (
                 actual_num_batched_tokens - num_prompt_tokens_iter +
                 num_generation_tokens_from_prefill_groups)
-
+            num_tokens_iter = (num_generation_tokens_iter +
+                               num_prompt_tokens_iter)
         # Spec decode, if enabled, emits specialized metrics from the worker in
         # sampler output.
         if model_output and (model_output[0].spec_decode_worker_metrics
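
Note: a worked example of the two token-count identities above, with illustrative numbers only:

# Suppose this iteration batched 2048 tokens, 1536 of them prompt tokens,
# and prefill groups that also emitted their first output token
# contributed 4 generation tokens.
actual_num_batched_tokens = 2048
num_prompt_tokens_iter = 1536
num_generation_tokens_from_prefill_groups = 4

num_generation_tokens_iter = (actual_num_batched_tokens -
                              num_prompt_tokens_iter +
                              num_generation_tokens_from_prefill_groups)  # 516
num_tokens_iter = num_generation_tokens_iter + num_prompt_tokens_iter    # 2052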
@@ -1837,6 +1859,7 @@
             # Iteration stats
             num_prompt_tokens_iter=num_prompt_tokens_iter,
             num_generation_tokens_iter=num_generation_tokens_iter,
+            num_tokens_iter=num_tokens_iter,
             time_to_first_tokens_iter=time_to_first_tokens_iter,
             time_per_output_tokens_iter=time_per_output_tokens_iter,
             spec_decode_metrics=spec_decode_metrics,
@@ -1845,12 +1868,18 @@
             # Request stats
             # Latency
             time_e2e_requests=time_e2e_requests,
+            time_queue_requests=time_queue_requests,
+            time_inference_requests=time_inference_requests,
+            time_prefill_requests=time_prefill_requests,
+            time_decode_requests=time_decode_requests,
             time_in_queue_requests=time_in_queue_requests,
             model_forward_time_requests=model_forward_time_requests,
             model_execute_time_requests=model_execute_time_requests,
             # Metadata
             num_prompt_tokens_requests=num_prompt_tokens_requests,
             num_generation_tokens_requests=num_generation_tokens_requests,
+            max_num_generation_tokens_requests=
+            max_num_generation_tokens_requests,
             n_requests=n_requests,
             max_tokens_requests=max_tokens_requests,
             finished_reason_requests=finished_reason_requests,
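
Note: the new Stats fields are consumed by the logger/exporter in the companion metrics change, which this commit also touches but is not shown here. A hedged sketch of how such per-request lists could be exported as Prometheus histograms; the metric name and buckets below are assumptions, not vLLM's actual ones.

from typing import List

from prometheus_client import Histogram

# Hypothetical histogram for the queue-time phase; one observation is
# recorded per finished request.
request_queue_time = Histogram(
    "vllm_request_queue_time_seconds",  # assumed name, not vLLM's real one
    "Time each finished request spent waiting to be scheduled.",
    buckets=(0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0))

def log_request_stats(time_queue_requests: List[float]) -> None:
    # Mirrors how the engine accumulates a List[float] per stats interval:
    # each element is one finished request's queue time in seconds.
    for t in time_queue_requests:
        request_queue_time.observe(t)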