[Misc] Add OpenTelemetry support (#4687)
This PR adds basic support for OpenTelemetry distributed tracing. It includes changes to enable tracing functionality and improve monitoring capabilities. I've also added a markdown document with screenshots to guide users on how to use this feature. You can find it here
This commit is contained in:
@@ -244,6 +244,9 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
# Log stats.
|
||||
self.do_log_stats(scheduler_outputs, output)
|
||||
|
||||
# Tracing
|
||||
self.do_tracing(scheduler_outputs)
|
||||
|
||||
if not request_outputs:
|
||||
# Stop the execute model loop in parallel workers until there are
|
||||
# more requests to process. This avoids waiting indefinitely in
|
||||
@@ -285,6 +288,7 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
arrival_time: Optional[float] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> None:
|
||||
if lora_request is not None and not self.lora_config:
|
||||
raise ValueError(f"Got lora_request {lora_request} but LoRA is "
|
||||
@@ -301,6 +305,7 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
params=params,
|
||||
arrival_time=arrival_time,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
|
||||
async def check_health_async(self) -> None:
|
||||
@@ -556,6 +561,7 @@ class AsyncLLMEngine:
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
arrival_time: Optional[float] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncStream:
|
||||
if self.log_requests:
|
||||
if isinstance(inputs, str):
|
||||
@@ -597,6 +603,7 @@ class AsyncLLMEngine:
|
||||
params=params,
|
||||
arrival_time=arrival_time,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
|
||||
return stream
|
||||
@@ -607,6 +614,7 @@ class AsyncLLMEngine:
|
||||
sampling_params: SamplingParams,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncIterator[RequestOutput]:
|
||||
"""Generate outputs for a request.
|
||||
|
||||
@@ -621,6 +629,7 @@ class AsyncLLMEngine:
|
||||
sampling_params: The sampling parameters of the request.
|
||||
request_id: The unique id of the request.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
trace_headers: OpenTelemetry trace headers.
|
||||
|
||||
Yields:
|
||||
The output `RequestOutput` objects from the LLMEngine
|
||||
@@ -674,6 +683,7 @@ class AsyncLLMEngine:
|
||||
inputs,
|
||||
sampling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
):
|
||||
yield LLMEngine.validate_output(output, RequestOutput)
|
||||
|
||||
@@ -683,6 +693,7 @@ class AsyncLLMEngine:
|
||||
pooling_params: PoolingParams,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncIterator[EmbeddingRequestOutput]:
|
||||
"""Generate outputs for a request from an embedding model.
|
||||
|
||||
@@ -697,6 +708,7 @@ class AsyncLLMEngine:
|
||||
pooling_params: The pooling parameters of the request.
|
||||
request_id: The unique id of the request.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
trace_headers: OpenTelemetry trace headers.
|
||||
|
||||
Yields:
|
||||
The output `EmbeddingRequestOutput` objects from the LLMEngine
|
||||
@@ -748,6 +760,7 @@ class AsyncLLMEngine:
|
||||
inputs,
|
||||
pooling_params,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
):
|
||||
yield LLMEngine.validate_output(output, EmbeddingRequestOutput)
|
||||
|
||||
@@ -758,6 +771,7 @@ class AsyncLLMEngine:
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
*,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
trace_headers: Optional[Dict[str, str]] = None,
|
||||
) -> AsyncIterator[Union[RequestOutput, EmbeddingRequestOutput]]:
|
||||
"""Common logic to process requests with SamplingParams or
|
||||
PoolingParams."""
|
||||
@@ -769,6 +783,7 @@ class AsyncLLMEngine:
|
||||
params,
|
||||
arrival_time=arrival_time,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -848,3 +863,10 @@ class AsyncLLMEngine:
|
||||
else:
|
||||
await self.engine.check_health_async()
|
||||
logger.debug("Health check took %fs", time.perf_counter() - t)
|
||||
|
||||
async def is_tracing_enabled(self) -> bool:
    """Return whether OpenTelemetry tracing is enabled on the wrapped engine.

    Returns:
        True if the underlying ``LLMEngine`` reports tracing as enabled.

    NOTE(review): reconstructed from a markup-mangled diff extraction; the
    surrounding ``AsyncLLMEngine`` class is not visible here — confirm
    against the upstream file.
    """
    if self.engine_use_ray:
        # The engine lives in a Ray actor; the remote call returns a
        # future that must be awaited.
        return await self.engine.is_tracing_enabled.remote()  # type: ignore
    else:
        # Engine is in-process; query it synchronously.
        return self.engine.is_tracing_enabled()
|
||||
|
||||
Reference in New Issue
Block a user