[Metrics] add more metrics (#4464)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: Robert Shaw <rshaw@neuralmagic.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -1672,6 +1672,7 @@ class LLMEngine:
|
||||
# Iteration stats
|
||||
num_prompt_tokens_iter = 0
|
||||
num_generation_tokens_iter = 0
|
||||
num_tokens_iter = 0
|
||||
time_to_first_tokens_iter: List[float] = []
|
||||
time_per_output_tokens_iter: List[float] = []
|
||||
num_preemption_iter = (0 if scheduler_outputs is None else
|
||||
@@ -1680,6 +1681,10 @@ class LLMEngine:
|
||||
# Request stats
|
||||
# Latency
|
||||
time_e2e_requests: List[float] = []
|
||||
time_queue_requests: List[float] = []
|
||||
time_inference_requests: List[float] = []
|
||||
time_prefill_requests: List[float] = []
|
||||
time_decode_requests: List[float] = []
|
||||
time_in_queue_requests: List[float] = []
|
||||
model_forward_time_requests: List[float] = []
|
||||
model_execute_time_requests: List[float] = []
|
||||
@@ -1687,6 +1692,7 @@ class LLMEngine:
|
||||
num_prompt_tokens_requests: List[int] = []
|
||||
num_generation_tokens_requests: List[int] = []
|
||||
n_requests: List[int] = []
|
||||
max_num_generation_tokens_requests: List[int] = []
|
||||
max_tokens_requests: List[int] = []
|
||||
finished_reason_requests: List[str] = []
|
||||
|
||||
@@ -1777,6 +1783,18 @@ class LLMEngine:
|
||||
# Latency timings
|
||||
time_e2e_requests.append(now -
|
||||
seq_group.metrics.arrival_time)
|
||||
if (seq_group.metrics.first_scheduled_time is not None and
|
||||
seq_group.metrics.first_token_time is not None):
|
||||
time_queue_requests.append(
|
||||
seq_group.metrics.first_scheduled_time -
|
||||
seq_group.metrics.arrival_time)
|
||||
time_prefill_requests.append(
|
||||
seq_group.metrics.first_token_time -
|
||||
seq_group.metrics.first_scheduled_time)
|
||||
time_decode_requests.append(
|
||||
now - seq_group.metrics.first_token_time)
|
||||
time_inference_requests.append(
|
||||
now - seq_group.metrics.first_scheduled_time)
|
||||
if seq_group.metrics.time_in_queue is not None:
|
||||
time_in_queue_requests.append(
|
||||
seq_group.metrics.time_in_queue)
|
||||
@@ -1793,6 +1811,9 @@ class LLMEngine:
|
||||
seq.get_output_len()
|
||||
for seq in seq_group.get_finished_seqs()
|
||||
])
|
||||
max_num_generation_tokens_requests.append(
|
||||
max(seq.get_output_len()
|
||||
for seq in seq_group.get_seqs()))
|
||||
if seq_group.sampling_params is not None:
|
||||
n_requests.append(seq_group.sampling_params.n)
|
||||
max_tokens_requests.append(
|
||||
@@ -1811,7 +1832,8 @@ class LLMEngine:
|
||||
num_generation_tokens_iter = (
|
||||
actual_num_batched_tokens - num_prompt_tokens_iter +
|
||||
num_generation_tokens_from_prefill_groups)
|
||||
|
||||
num_tokens_iter = (num_generation_tokens_iter +
|
||||
num_prompt_tokens_iter)
|
||||
# Spec decode, if enabled, emits specialized metrics from the worker in
|
||||
# sampler output.
|
||||
if model_output and (model_output[0].spec_decode_worker_metrics
|
||||
@@ -1837,6 +1859,7 @@ class LLMEngine:
|
||||
# Iteration stats
|
||||
num_prompt_tokens_iter=num_prompt_tokens_iter,
|
||||
num_generation_tokens_iter=num_generation_tokens_iter,
|
||||
num_tokens_iter=num_tokens_iter,
|
||||
time_to_first_tokens_iter=time_to_first_tokens_iter,
|
||||
time_per_output_tokens_iter=time_per_output_tokens_iter,
|
||||
spec_decode_metrics=spec_decode_metrics,
|
||||
@@ -1845,12 +1868,18 @@ class LLMEngine:
|
||||
# Request stats
|
||||
# Latency
|
||||
time_e2e_requests=time_e2e_requests,
|
||||
time_queue_requests=time_queue_requests,
|
||||
time_inference_requests=time_inference_requests,
|
||||
time_prefill_requests=time_prefill_requests,
|
||||
time_decode_requests=time_decode_requests,
|
||||
time_in_queue_requests=time_in_queue_requests,
|
||||
model_forward_time_requests=model_forward_time_requests,
|
||||
model_execute_time_requests=model_execute_time_requests,
|
||||
# Metadata
|
||||
num_prompt_tokens_requests=num_prompt_tokens_requests,
|
||||
num_generation_tokens_requests=num_generation_tokens_requests,
|
||||
max_num_generation_tokens_requests=
|
||||
max_num_generation_tokens_requests,
|
||||
n_requests=n_requests,
|
||||
max_tokens_requests=max_tokens_requests,
|
||||
finished_reason_requests=finished_reason_requests,
|
||||
|
||||
Reference in New Issue
Block a user