[Misc] Add OpenTelemetry support (#4687)
This PR adds basic support for OpenTelemetry distributed tracing. It includes changes to enable tracing functionality and improve monitoring capabilities. I've also added a markdown document with screenshots to guide users on how to use this feature. You can find it here
This commit is contained in:
@@ -244,6 +244,9 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
# Log stats.
|
||||
self.do_log_stats(scheduler_outputs, output)
|
||||
|
||||
# Tracing
|
||||
self.do_tracing(scheduler_outputs)
|
||||
|
||||
if not request_outputs:
|
||||
# Stop the execute model loop in parallel workers until there are
|
||||
# more requests to process. This avoids waiting indefinitely in
|
||||
@@ -285,6 +288,7 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
arrival_time: Optional[float] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> None:
|
||||
if lora_request is not None and not self.lora_config:
|
||||
raise ValueError(f"Got lora_request {lora_request} but LoRA is "
|
||||
@@ -301,6 +305,7 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
params=params,
|
||||
arrival_time=arrival_time,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
|
||||
async def check_health_async(self) -> None:
|
||||
@@ -556,6 +561,7 @@ class AsyncLLMEngine:
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
arrival_time: Optional[float] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncStream:
|
||||
if self.log_requests:
|
||||
if isinstance(inputs, str):
|
||||
@@ -597,6 +603,7 @@ class AsyncLLMEngine:
|
||||
params=params,
|
||||
arrival_time=arrival_time,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
|
||||
return stream
|
||||
@@ -607,6 +614,7 @@ class AsyncLLMEngine:
|
||||
sampling_params: SamplingParams,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncIterator[RequestOutput]:
|
||||
"""Generate outputs for a request.
|
||||
|
||||
@@ -621,6 +629,7 @@ class AsyncLLMEngine:
|
||||
sampling_params: The sampling parameters of the request.
|
||||
request_id: The unique id of the request.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
trace_headers: OpenTelemetry trace headers.
|
||||
|
||||
Yields:
|
||||
The output `RequestOutput` objects from the LLMEngine
|
||||
@@ -674,6 +683,7 @@ class AsyncLLMEngine:
|
||||
inputs,
|
||||
sampling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
):
|
||||
yield LLMEngine.validate_output(output, RequestOutput)
|
||||
|
||||
@@ -683,6 +693,7 @@ class AsyncLLMEngine:
|
||||
pooling_params: PoolingParams,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncIterator[EmbeddingRequestOutput]:
|
||||
"""Generate outputs for a request from an embedding model.
|
||||
|
||||
@@ -697,6 +708,7 @@ class AsyncLLMEngine:
|
||||
pooling_params: The pooling parameters of the request.
|
||||
request_id: The unique id of the request.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
trace_headers: OpenTelemetry trace headers.
|
||||
|
||||
Yields:
|
||||
The output `EmbeddingRequestOutput` objects from the LLMEngine
|
||||
@@ -748,6 +760,7 @@ class AsyncLLMEngine:
|
||||
inputs,
|
||||
pooling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
):
|
||||
yield LLMEngine.validate_output(output, EmbeddingRequestOutput)
|
||||
|
||||
@@ -758,6 +771,7 @@ class AsyncLLMEngine:
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
*,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]:
|
||||
"""Common logic to process requests with SamplingParams or
|
||||
PoolingParams."""
|
||||
@@ -769,6 +783,7 @@ class AsyncLLMEngine:
|
||||
params,
|
||||
arrival_time=arrival_time,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -848,3 +863,10 @@ class AsyncLLMEngine:
|
||||
else:
|
||||
await self.engine.check_health_async()
|
||||
logger.debug("Health check took %fs", time.perf_counter() - t)
|
||||
|
||||
async def is_tracing_enabled(self) -> bool:
    """Return whether OpenTelemetry tracing is enabled on the wrapped engine.

    Returns:
        True if the underlying ``LLMEngine`` reports tracing as enabled.

    NOTE(review): reconstructed from a markup-mangled diff extraction; the
    surrounding ``AsyncLLMEngine`` class is not visible here — confirm
    against the upstream file.
    """
    if self.engine_use_ray:
        # The engine lives in a Ray actor; the remote call returns a
        # future that must be awaited.
        return await self.engine.is_tracing_enabled.remote()  # type: ignore
    else:
        # Engine is in-process; query it synchronously.
        return self.engine.is_tracing_enabled()
|
||||
|
||||
Reference in New Issue
Block a user